In [156]:
import pandas as pd
import ast

In [157]:
original_dataframe = pd.read_csv("../data/full_import_dataset/full_import_dataset_with_libname.csv", index_col=0)
original_dataframe.head()

Unnamed: 0,project_name,import,is_kotlin_import,is_python_import,package
0,2dust#v2rayNG,androidx.appcompat.app.AppCompatActivity,1,0,androidx.appcompat
1,2dust#v2rayNG,android.view.MenuItem,1,0,android.view
2,2dust#v2rayNG,android.Manifest,1,0,android
3,2dust#v2rayNG,android.content.Intent,1,0,android.content
4,2dust#v2rayNG,android.net.Uri,1,0,android.net


In [158]:
# Count diff imports of packages in projects
intermediate_dataframe = original_dataframe \
    .groupby(['project_name', 'package']) \
    .agg({'import':'count', 'is_kotlin_import':'first', 'is_python_import':'first'}) \
    .reset_index() \
    .rename(columns={'import':'count_different_import', 'is_kotlin_import':'is_kotlin_package', 'is_python_import':'is_python_package'})

intermediate_dataframe.head()

Unnamed: 0,project_name,package,count_different_import,is_kotlin_package,is_python_package
0,01joy#news-search-engine,bs4,1,0,1
1,01joy#news-search-engine,datetime,2,0,1
2,01joy#news-search-engine,flask,3,0,1
3,01joy#news-search-engine,index,1,0,1
4,01joy#news-search-engine,jieba,1,0,1


In [159]:
#Check for collision - it's ok
intermediate_dataframe[intermediate_dataframe["is_kotlin_package"] == intermediate_dataframe["is_python_package"]]

Unnamed: 0,project_name,package,count_different_import,is_kotlin_package,is_python_package


In [160]:
# Drop out 'is_kotlin_package' and 'is_python_package' since there are no collisions

## Package features extraction

In [161]:
# Rename each package name to avoid name collision with tag names
intermediate_dataframe.package = intermediate_dataframe.package.apply(lambda x: f"package#{x}")
intermediate_dataframe.head()

Unnamed: 0,project_name,package,count_different_import,is_kotlin_package,is_python_package
0,01joy#news-search-engine,package#bs4,1,0,1
1,01joy#news-search-engine,package#datetime,2,0,1
2,01joy#news-search-engine,package#flask,3,0,1
3,01joy#news-search-engine,package#index,1,0,1
4,01joy#news-search-engine,package#jieba,1,0,1


In [162]:
pivot_package_dataframe = intermediate_dataframe.pivot_table(index="project_name", columns="package", values="count_different_import", fill_value=0)
pivot_package_dataframe.head()

package,package#A,package#ACGAN,package#APDrawing,package#APDrawingGAN,package#APDrawingGAN.data,package#APDrawingGAN.data.base_data_loader,package#APDrawingGAN.data.base_dataset,package#APDrawingGAN.data.face_landmark,package#APDrawingGAN.data.image_folder,package#APDrawingGAN.models,...,package#zwave_js_server.model.utils,package#zwave_js_server.model.value,package#zwave_js_server.util.command_class,package#zwave_js_server.util.lock,package#zwave_js_server.util.multicast,package#zwave_js_server.util.node,package#zwave_js_server.version,package#zwave_me_ws,package#zxcvbn,package#zypp_plugin
project_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01joy#news-search-engine,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
05bit#peewee-async,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0k#shyaml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0x00-0x00#ShellPop,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0xAX#linux-insides,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [163]:
pivot_package_dataframe.shape

(2853, 15383)

In [164]:
# So it's ok
pivot_package_dataframe["package#APDrawingGAN.models"].value_counts()

0    2852
1       1
Name: package#APDrawingGAN.models, dtype: int64

## Extencion features extraction

In [165]:
# Load ext count dataset
ext_count_dataset = pd.read_csv("../data/ext_count/full_ext_count_dataset.csv", index_col=0)
ext_count_dataset.head()

Unnamed: 0,project_name,ext,count
0,donnemartin#gitsome,.yml,4
1,donnemartin#gitsome,,30
2,donnemartin#gitsome,.md,10
3,donnemartin#gitsome,.py,345
4,donnemartin#gitsome,.txt,3


In [166]:
ext_count_dataset.shape

(79546, 3)

In [167]:
ext_count_dataset = ext_count_dataset.dropna()

In [168]:
ext_count_dataset.shape

(75348, 3)

In [169]:
ext_count_dataset.ext = ext_count_dataset.ext.apply(lambda x: f"extension#{x}")
ext_count_dataset.head()

Unnamed: 0,project_name,ext,count
0,donnemartin#gitsome,extension#.yml,4
2,donnemartin#gitsome,extension#.md,10
3,donnemartin#gitsome,extension#.py,345
4,donnemartin#gitsome,extension#.txt,3
5,donnemartin#gitsome,extension#.ini,1


In [170]:
pivot_ext_count_dataset = ext_count_dataset.pivot_table(index="project_name", columns="ext", values="count", fill_value=0)
pivot_ext_count_dataset.head()

ext,extension#.,extension#.0,extension#.0+,extension#.0-AT,extension#.0-Combined-Work-Exception,extension#.0-DE,extension#.0-FR,extension#.0-IGO,extension#.0-NL,extension#.0-Perl,...,extension#.zktx,extension#.zlib,extension#.zmpl,extension#.zone,extension#.zoneinfo,extension#.zpln,extension#.zraw,extension#.zsh,extension#.zst,extension#.zzz
project_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01joy#news-search-engine,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
05bit#peewee-async,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0k#shyaml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0x00-0x00#ShellPop,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0xAX#linux-insides,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [171]:
pivot_ext_count_dataset.shape

(3404, 3852)

In [172]:
# So it's ok
pivot_ext_count_dataset["extension#.0-Perl"].value_counts()

0    3403
1       1
Name: extension#.0-Perl, dtype: int64

## Kaggle dataset

In [173]:
# Load kaggle_repos dataframe
kaggle_repos_dataframe = pd.read_csv("../data/kaggle_repos.csv")
kaggle_repos_dataframe.head()

Unnamed: 0,topic,name,owner,owner_type,full_name,description,og_image,license,is_archived,is_forked,...,open_issues,forks,stars,watchers,has_wiki,has_pages,has_sponsorship,created_at,updated_at,repo
0,3d,SpaceshipGenerator,a1studmuffin,User,a1studmuffin/SpaceshipGenerator,A Blender script to procedurally generate 3D s...,,Other,False,False,...,13,391,7100,7100,True,False,False,2016-06-14T03:16:07Z,2022-04-06T16:55:48Z,https://github.com/a1studmuffin/SpaceshipGener...
1,3d,BlenderGIS,domlysz,User,domlysz/BlenderGIS,,,GNU General Public License v3.0,False,False,...,155,923,5047,5047,True,False,False,2014-05-08T14:48:25Z,2022-04-08T00:59:56Z,https://github.com/domlysz/BlenderGIS
2,3d,PRNet,YadiraF,User,YadiraF/PRNet,Joint 3D Face Reconstruction and Dense Alignme...,,MIT License,False,False,...,160,919,4574,4574,True,False,False,2018-03-20T11:44:06Z,2022-04-08T03:00:19Z,https://github.com/YadiraF/PRNet
3,3d,3DDFA,cleardusk,User,cleardusk/3DDFA,The PyTorch improved version of TPAMI 2017 pap...,,MIT License,False,False,...,51,621,3223,3223,False,False,False,2018-06-29T14:19:21Z,2022-04-07T07:47:09Z,https://github.com/cleardusk/3DDFA
4,3d,armory,armory3d,Organization,armory3d/armory,,,zlib License,False,False,...,310,263,2421,2421,True,False,False,2015-10-29T18:27:56Z,2022-04-06T21:17:40Z,https://github.com/armory3d/armory


In [174]:
kaggle_repos_dataframe.columns

Index(['topic', 'name', 'owner', 'owner_type', 'full_name', 'description',
       'og_image', 'license', 'is_archived', 'is_forked', 'size', 'language',
       'tags', 'open_issues', 'forks', 'stars', 'watchers', 'has_wiki',
       'has_pages', 'has_sponsorship', 'created_at', 'updated_at', 'repo'],
      dtype='object')

In [175]:
# Prepare column for join
kaggle_repos_dataframe["project_name"] = kaggle_repos_dataframe["owner"] + "#" + kaggle_repos_dataframe["name"]
kaggle_repos_dataframe["project_name"].head()

0    a1studmuffin#SpaceshipGenerator
1                 domlysz#BlenderGIS
2                      YadiraF#PRNet
3                    cleardusk#3DDFA
4                    armory3d#armory
Name: project_name, dtype: object

## Repos tags extraction

In [176]:
kaggle_repos_dataframe_tags = kaggle_repos_dataframe[["project_name", "tags"]]
kaggle_repos_dataframe_tags.head()

Unnamed: 0,project_name,tags
0,a1studmuffin#SpaceshipGenerator,"['python', 'procedural-generation', 'blender-s..."
1,domlysz#BlenderGIS,[]
2,YadiraF#PRNet,"['swap', 'face', 'alignment', 'reconstruction'..."
3,cleardusk#3DDFA,"['python', 'computer-vision', 'deep-learning',..."
4,armory3d#armory,[]


In [177]:
kaggle_repos_dataframe_tags.tags = kaggle_repos_dataframe_tags.tags.apply(ast.literal_eval)
kaggle_repos_dataframe_tags.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,project_name,tags
0,a1studmuffin#SpaceshipGenerator,"[python, procedural-generation, blender-script..."
1,domlysz#BlenderGIS,[]
2,YadiraF#PRNet,"[swap, face, alignment, reconstruction, 3d]"
3,cleardusk#3DDFA,"[python, computer-vision, deep-learning, pytor..."
4,armory3d#armory,[]


In [178]:
# Prepare dataset for pivot
kaggle_repos_dataframe_tags = kaggle_repos_dataframe_tags.explode("tags")
kaggle_repos_dataframe_tags.head()

Unnamed: 0,project_name,tags
0,a1studmuffin#SpaceshipGenerator,python
0,a1studmuffin#SpaceshipGenerator,procedural-generation
0,a1studmuffin#SpaceshipGenerator,blender-scripts
0,a1studmuffin#SpaceshipGenerator,game-development
0,a1studmuffin#SpaceshipGenerator,3d


In [179]:
kaggle_repos_dataframe_tags["val"] = 1
kaggle_repos_dataframe_tags.head()

Unnamed: 0,project_name,tags,val
0,a1studmuffin#SpaceshipGenerator,python,1
0,a1studmuffin#SpaceshipGenerator,procedural-generation,1
0,a1studmuffin#SpaceshipGenerator,blender-scripts,1
0,a1studmuffin#SpaceshipGenerator,game-development,1
0,a1studmuffin#SpaceshipGenerator,3d,1


In [180]:
# Rename each tag name to avoid name collision with package names
kaggle_repos_dataframe_tags.tags = kaggle_repos_dataframe_tags.tags.apply(lambda x: f"tag#{x}")
kaggle_repos_dataframe_tags.head()

Unnamed: 0,project_name,tags,val
0,a1studmuffin#SpaceshipGenerator,tag#python,1
0,a1studmuffin#SpaceshipGenerator,tag#procedural-generation,1
0,a1studmuffin#SpaceshipGenerator,tag#blender-scripts,1
0,a1studmuffin#SpaceshipGenerator,tag#game-development,1
0,a1studmuffin#SpaceshipGenerator,tag#3d,1


In [181]:
# Total number of tags
kaggle_repos_dataframe_tags.tags.unique().size

9148

In [182]:
kaggle_repos_dataframe_tags.project_name.unique().size

3418

In [183]:
pivot_tags_dataframe = kaggle_repos_dataframe_tags.pivot_table(index="project_name", values="val", columns="tags", fill_value=0)
pivot_tags_dataframe.head()

tags,tag#010editor,tag#053,tag#0day,tag#100daysofcode,tag#104,tag#1111,tag#12306,tag#163mail-login,tag#2,tag#2019-ncov,...,tag#zircon,tag#zookeeper,tag#zoom,tag#zotero,tag#zotero-api,tag#zsh,tag#ztp,tag#zuul,tag#zvt,tag#zypper
project_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01joy#news-search-engine,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
05bit#peewee-async,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0k#shyaml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0x00-0x00#ShellPop,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0xAX#linux-insides,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [184]:
set(pivot_tags_dataframe.columns).difference(set(kaggle_repos_dataframe.tags.unique()))

{'tag#learning-python',
 'tag#elastic-net',
 'tag#android-studio-poet',
 'tag#crm-system',
 'tag#fabric-tasks',
 'tag#apriori',
 'tag#acra',
 'tag#ggplot',
 'tag#satinder',
 'tag#data-versioning',
 'tag#myserver-start',
 'tag#handlebars',
 'tag#digipres',
 'tag#labs',
 'tag#basic',
 'tag#myo',
 'tag#workflow-engine',
 'tag#syscall-table',
 'tag#io',
 'tag#convolutional-neural-network',
 'tag#neural-search',
 'tag#yolov5',
 'tag#django-intercoolerjs',
 'tag#ansible-inventory',
 'tag#document-classification',
 'tag#user-registration',
 'tag#simplelogin',
 'tag#hellogithub',
 'tag#knowledge-defined-networking',
 'tag#arch-linux',
 'tag#exercises',
 'tag#image-download',
 'tag#naivebayes',
 'tag#sre',
 'tag#nuclei-checks',
 'tag#media-player',
 'tag#experiment-track',
 'tag#awsmfa',
 'tag#multipleaccounts',
 'tag#archivebox',
 'tag#markdownx',
 'tag#triton',
 'tag#fontforge',
 'tag#privacy-online',
 'tag#termux-ubuntu',
 'tag#fragments',
 'tag#human-action-recognition',
 'tag#news-aggregat

In [185]:
pivot_tags_dataframe.shape

(3418, 9148)

In [187]:
# Check values
pivot_tags_dataframe["tag#haskell"].value_counts()

0    3413
1       5
Name: tag#haskell, dtype: int64

## Repos topics extraction

In [188]:
kaggle_repos_dataframe_topics = kaggle_repos_dataframe[["project_name", "topic"]]
kaggle_repos_dataframe_topics.head()

Unnamed: 0,project_name,topic
0,a1studmuffin#SpaceshipGenerator,3d
1,domlysz#BlenderGIS,3d
2,YadiraF#PRNet,3d
3,cleardusk#3DDFA,3d
4,armory3d#armory,3d


In [189]:
kaggle_repos_dataframe_topics["val"] = 1
kaggle_repos_dataframe_topics.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kaggle_repos_dataframe_topics["val"] = 1


Unnamed: 0,project_name,topic,val
0,a1studmuffin#SpaceshipGenerator,3d,1
1,domlysz#BlenderGIS,3d,1
2,YadiraF#PRNet,3d,1
3,cleardusk#3DDFA,3d,1
4,armory3d#armory,3d,1


In [190]:
kaggle_repos_dataframe_topics.topic = kaggle_repos_dataframe_topics.topic.apply(lambda x: f"topic#{x}")
kaggle_repos_dataframe_topics.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,project_name,topic,val
0,a1studmuffin#SpaceshipGenerator,topic#3d,1
1,domlysz#BlenderGIS,topic#3d,1
2,YadiraF#PRNet,topic#3d,1
3,cleardusk#3DDFA,topic#3d,1
4,armory3d#armory,topic#3d,1


In [191]:
pivot_topics_dataframe = kaggle_repos_dataframe_topics.pivot_table(index="project_name", values="val", columns="topic", fill_value=0)
pivot_topics_dataframe.head()

topic,topic#3d,topic#ajax,topic#algorithm,topic#android,topic#angular,topic#ansible,topic#api,topic#arduino,topic#aspnet,topic#atom,...,topic#vim,topic#virtual-reality,topic#vue,topic#wagtail,topic#web-components,topic#webapp,topic#webpack,topic#windows,topic#wordpress,topic#xml
project_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01joy#news-search-engine,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
05bit#peewee-async,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0k#shyaml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0x00-0x00#ShellPop,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0xAX#linux-insides,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Final dataset construction

In [192]:
# Final step - join package columns (features) and tags columns (targets)

In [193]:
# First of all check size of dataframes
pivot_tags_dataframe.shape

(3418, 9148)

In [194]:
pivot_topics_dataframe.shape

(3418, 161)

In [195]:
pivot_package_dataframe.shape

(2853, 15383)

In [196]:
pivot_ext_count_dataset.shape

(3404, 3852)

In [197]:
set(pivot_tags_dataframe.index).difference(set(pivot_package_dataframe.index))

{'1N3#Wordpress-XMLRPC-Brute-Force-Exploit',
 '666wcy#ARPT-Bot',
 'Ank-Cha#Social-Distancing-Analyser-COVID-19',
 'Bhupesh-V#memer-action',
 'CLUEbenchmark#CLUEDatasetSearch',
 'CiKu370#OSIF',
 'Comcast#ansible-sdkman',
 'Dineshkarthik#telegram_media_downloader',
 'DmrfCoder#AlgorithmAndDataStructure',
 'EZFNDEV#EZFN-Lobbybot',
 'EZForever#HiveMiner',
 'FeeiCN#Security-PPT',
 'Hello-Linux#Ansible-VIM-IDE',
 'HoboVR-Labs#hobo_vr',
 'INTERMT#Awesome-PyTorch-Chinese',
 'Jack-Cherish#PythonPark',
 'JalanJiang#leetcode-notebook',
 'Karan-Malik#FaceMaskDetector',
 'Kylmakalle#heroku-telegram-bot',
 'LyleMi#Learn-Web-Hacking',
 'MMehdiMousavi#SuperCaustics',
 'Nanoseb#ncTelegram',
 'Neko250#sublime-PICO-8',
 'Panintelligence#picons',
 'RunaCapital#awesome-oss-alternatives',
 'RyanAWalters#PowerOf2ImageResizer',
 'Singosgu#GreaterWMS',
 'SpEcHiDe#AnyDLBot',
 'SpectralVectors#RightMouseNavigation',
 'TarrySingh#Artificial-Intelligence-Deep-Learning-Machine-Learning-Tutorials',
 'Tautulli#Tautul

In [198]:
set(pivot_tags_dataframe.columns).intersection(set(pivot_package_dataframe.columns))

set()

In [199]:
# Check for package names and tag names intersection
set(pivot_topics_dataframe.columns).intersection(set(pivot_package_dataframe.columns))

set()

In [200]:
set(pivot_tags_dataframe.columns).intersection(set(pivot_ext_count_dataset.columns))

set()

In [201]:
set(pivot_package_dataframe.columns).intersection(set(pivot_ext_count_dataset.columns))

set()

In [202]:
final_dataset = pivot_package_dataframe.join(pivot_ext_count_dataset, on="project_name", how="inner")
final_dataset.head()

Unnamed: 0_level_0,package#A,package#ACGAN,package#APDrawing,package#APDrawingGAN,package#APDrawingGAN.data,package#APDrawingGAN.data.base_data_loader,package#APDrawingGAN.data.base_dataset,package#APDrawingGAN.data.face_landmark,package#APDrawingGAN.data.image_folder,package#APDrawingGAN.models,...,extension#.zktx,extension#.zlib,extension#.zmpl,extension#.zone,extension#.zoneinfo,extension#.zpln,extension#.zraw,extension#.zsh,extension#.zst,extension#.zzz
project_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01joy#news-search-engine,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
05bit#peewee-async,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0k#shyaml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0x00-0x00#ShellPop,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0xAX#linux-insides,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [203]:
final_dataset = final_dataset.join(pivot_tags_dataframe, on="project_name", how="inner")
final_dataset.head()

Unnamed: 0_level_0,package#A,package#ACGAN,package#APDrawing,package#APDrawingGAN,package#APDrawingGAN.data,package#APDrawingGAN.data.base_data_loader,package#APDrawingGAN.data.base_dataset,package#APDrawingGAN.data.face_landmark,package#APDrawingGAN.data.image_folder,package#APDrawingGAN.models,...,tag#zircon,tag#zookeeper,tag#zoom,tag#zotero,tag#zotero-api,tag#zsh,tag#ztp,tag#zuul,tag#zvt,tag#zypper
project_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01joy#news-search-engine,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
05bit#peewee-async,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0k#shyaml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0x00-0x00#ShellPop,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0xAX#linux-insides,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [204]:
final_dataset = final_dataset.join(pivot_topics_dataframe, on="project_name", how="inner")
final_dataset.head()

Unnamed: 0_level_0,package#A,package#ACGAN,package#APDrawing,package#APDrawingGAN,package#APDrawingGAN.data,package#APDrawingGAN.data.base_data_loader,package#APDrawingGAN.data.base_dataset,package#APDrawingGAN.data.face_landmark,package#APDrawingGAN.data.image_folder,package#APDrawingGAN.models,...,topic#vim,topic#virtual-reality,topic#vue,topic#wagtail,topic#web-components,topic#webapp,topic#webpack,topic#windows,topic#wordpress,topic#xml
project_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01joy#news-search-engine,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
05bit#peewee-async,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0k#shyaml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0x00-0x00#ShellPop,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0xAX#linux-insides,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [205]:
final_dataset.shape

(2853, 28544)

In [206]:
pivot_tags_dataframe.shape[1] + pivot_package_dataframe.shape[1] + pivot_ext_count_dataset.shape[1] + pivot_topics_dataframe.shape[1]

28544

In [207]:
final_dataset.reset_index(inplace=True)

In [208]:
final_dataset.head()

Unnamed: 0,project_name,package#A,package#ACGAN,package#APDrawing,package#APDrawingGAN,package#APDrawingGAN.data,package#APDrawingGAN.data.base_data_loader,package#APDrawingGAN.data.base_dataset,package#APDrawingGAN.data.face_landmark,package#APDrawingGAN.data.image_folder,...,topic#vim,topic#virtual-reality,topic#vue,topic#wagtail,topic#web-components,topic#webapp,topic#webpack,topic#windows,topic#wordpress,topic#xml
0,01joy#news-search-engine,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,05bit#peewee-async,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0k#shyaml,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0x00-0x00#ShellPop,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0xAX#linux-insides,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [209]:
final_dataset.to_csv("../data/classification/count_dataset_for_multilabel.csv", index=False)