In [1]:
from utils import load_data
import config as cfg

import pandas as pd
import matplotlib.pyplot as plt
import pickle

In [2]:
# Load the per-user records from the path configured in cfg.USER_REPOS_FILE.
# The result is indexed by position and iterated in the cells below, so it is
# presumably a list of dicts - confirm against utils.load_data.
user_repos = load_data(cfg.USER_REPOS_FILE)

In [3]:
# Spot-check a single record to see its layout: a dict with 'username',
# 'languages' and 'topics' keys (see the output below).
# NOTE(review): index 1600 looks arbitrary - nothing else in the notebook refers to it.
user_repos[1600]

{'username': 'frytg',
 'languages': {'Dockerfile': 0.159,
  'Shell': 0.091,
  'JavaScript': 0.408,
  'TypeScript': 0.092,
  'HTML': 0.169,
  'SCSS': 0.073,
  'Ruby': 0.007},
 'topics': {'cloud-run': 1,
  'cloud-run-button': 1,
  'nginx': 1,
  'redirect-urls': 1,
  'cloudflare-pages': 1,
  'jamstack': 1,
  'jekyll': 1}}

In [4]:
# Collect each user's 'languages' and 'topics' dicts into two parallel lists
# (same order as user_repos). They must stay plain lists: Summer only
# aggregates its argument when it is a list.
languages, topics = [], []
for record in user_repos:
    languages.append(record['languages'])
    topics.append(record['topics'])

In [5]:
class Summer:
    """Accumulate per-key totals from dicts and rank the keys by total.

    Attributes:
        d: dict mapping key -> running total.
        s: keys ordered by descending total (rebuilt by ``_sort_keys``).
        v: totals in the same order as ``s``.
    """

    def __init__(self, d=None):
        """Create a Summer.

        Args:
            d: ``None`` (start empty), a list of dicts whose values are summed
               key by key, or a single mapping of initial totals. A mapping is
               copied, so later ``update`` calls never modify the caller's
               object.
        """
        # A literal ``{}`` default would be created once and shared by every
        # instance built without arguments, so use None as the sentinel.
        if d is None:
            self.d = {}
        elif isinstance(d, list):
            self.d = {}
            for el in d:
                self._update(el)
        else:
            self.d = dict(d)

        self._sort_keys()

    def _sort_keys(self):
        """Rebuild ``s`` and ``v`` from ``d``, largest total first."""
        # Negating the value sorts descending; sorted() is stable, so keys with
        # equal totals keep their insertion order.
        sorted_items = sorted(self.d.items(), key=lambda i: -i[1])
        self.s = [x for x, y in sorted_items]
        self.v = [y for x, y in sorted_items]

    def _update(self, d_):
        """Add every value of ``d_`` to the running totals without re-sorting."""
        for key, value in d_.items():
            # Explicit membership test instead of a bare ``except``: a value
            # that cannot be added now raises instead of silently replacing
            # the accumulated total.
            if key in self.d:
                self.d[key] += value
            else:
                self.d[key] = value

    def update(self, d_):
        """Add the values of ``d_`` to the totals and refresh the ranking."""
        self._update(d_)
        self._sort_keys()

    def most_common(self, n, with_values=False):
        """Return the top-ranked keys.

        Args:
            n: if ``n < 1``, or ``n`` is the float ``1.0``, it is treated as a
               fraction of the number of distinct keys (truncated to an int);
               otherwise it is used directly as the number of keys to return.
            with_values: when true, also return the matching totals.

        Returns:
            A list of keys, or a ``(keys, totals)`` tuple when ``with_values``
            is true.
        """
        if n < 1 or (isinstance(n, float) and n == 1):
            n_ = int(len(self) * n)
        else:
            n_ = n

        if with_values:
            return self.s[:n_], self.v[:n_]
        return self.s[:n_]

    def __len__(self):
        """Number of distinct keys seen so far."""
        return len(self.d)

In [6]:
# Sum every language's per-user value across all users (Summer adds the dicts key by key).
language_totals = Summer(languages)

In [7]:
# Language names with the highest totals. cfg.N_LANGS is read as a count, or
# as a fraction of all distinct languages when it is below 1 or the float 1.0.
common_languages = language_totals.most_common(cfg.N_LANGS)

In [8]:
# Show the selected languages in ranked order.
# NOTE(review): the output contains a 'null' entry - presumably repos with no
# detected language; confirm whether it should become a feature column.
', '.join(common_languages)

'JavaScript, Python, TypeScript, HTML, Go, Java, PHP, Jupyter Notebook, CSS, Shell, C++, Ruby, C, Rust, C#, Swift, Vue, Kotlin, Dockerfile, SCSS, Dart, Objective-C, R, Makefile, TeX, Lua, Elixir, Scala, Vim Script, HCL, Solidity, MATLAB, Clojure, PowerShell, CMake, Haskell, Emacs Lisp, Svelte, Julia, null, Smarty, Perl, Groovy, Assembly, Cuda, Blade, Nix, CoffeeScript, Less, Batchfile'

In [9]:
# Number of distinct language keys seen across all users.
len(language_totals)

452

In [10]:
# Same aggregation for topics: sum the per-user topic values, then keep the
# top-ranked topic names (cfg.N_TOPICS is a count, or a fraction if below 1 or the float 1.0).
topic_totals = Summer(topics)
common_topics = topic_totals.most_common(cfg.N_TOPICS)

In [11]:
# Show the selected topics in ranked order.
', '.join(common_topics)

'react, config, github-config, typescript, javascript, python, nodejs, deep-learning, machine-learning, golang, docker, hacktoberfest, pytorch, php, java, go, android, nextjs, reactjs, python3, vue, rust, swift, tensorflow, cli, api, ios, blog, css, aws, laravel, computer-vision, html, kotlin, tailwindcss, mongodb, nlp, game, vuejs, angular, react-native, ruby, kubernetes, cpp, webpack, graphql, macos, dotfiles, redis, express, spring-boot, docker-compose, node, algorithms, boilerplate, git, github, github-actions, flutter, mysql, ethereum, blockchain, redux, chrome-extension, c, data-science, flask, json, template, django, swiftui, natural-language-processing, wordpress, advent-of-code, postgresql, rest-api, website, bash, r, scala, reinforcement-learning, html5, portfolio, jest, linux, npm, firebase, vite, visualization, neural-network, eslint, markdown, personal-website, electron, dotnet, data-structures, gatsby, solidity, vim, terraform, csharp, shell, serverless, sql, database, vs

In [12]:
# Number of distinct topic keys seen across all users.
len(topic_totals)

15767

In [13]:
# Ranked language names together with their summed values.
common_languages, common_languages_freq = language_totals.most_common(cfg.N_LANGS, with_values=True)
# Build the frame column-wise. Stacking the two lists as rows and transposing
# forces both columns to object dtype and leaves an empty input without the
# 'Lang'/'Freq' columns; a dict of columns keeps 'Freq' numeric.
lang_features = pd.DataFrame({'Lang': common_languages, 'Freq': common_languages_freq})
lang_features

Unnamed: 0,Lang,Freq
0,JavaScript,7274.052
1,Python,4962.932
2,TypeScript,3303.555
3,HTML,2456.655
4,Go,1710.501
5,Java,1610.92
6,PHP,1371.427
7,Jupyter Notebook,1304.079
8,CSS,1184.155
9,Shell,1156.497


In [14]:
# Ranked topic names together with their summed values.
common_topics, common_topics_freq = topic_totals.most_common(cfg.N_TOPICS, with_values=True)
# Build the frame column-wise. Stacking the two lists as rows and transposing
# forces both columns to object dtype and leaves an empty input without the
# 'Topic'/'Freq' columns; a dict of columns keeps 'Freq' numeric.
topic_features = pd.DataFrame({'Topic': common_topics, 'Freq': common_topics_freq})
topic_features

Unnamed: 0,Topic,Freq
0,react,569
1,config,534
2,github-config,515
3,typescript,479
4,javascript,453
...,...,...
145,security,29
146,cryptocurrency,29
147,data,29
148,rails,29


In [15]:
def dicts_to_pandas(dicts, key, values):
    """Turn a list of records into a wide DataFrame of selected entries.

    Args:
        dicts: iterable of records; each has a 'username' entry and a mapping
            stored under ``key``.
        key: name of the per-record mapping to read (also used as the column prefix).
        values: list of entry names to extract from each record's mapping.

    Returns:
        DataFrame with a 'username' column followed by one ``<key>_<value>``
        column per requested value; entries missing from a record are 0.
    """
    table = [
        [record['username']]
        + [record[key][name] if name in record[key] else 0 for name in values]
        for record in dicts
    ]
    frame = pd.DataFrame(table, columns=['username'] + values)
    # Prefix every extracted column with the source key, e.g. 'languages_Python'.
    return frame.rename(columns={name: key + '_' + name for name in values})
    
    

In [16]:
# Per-user language features: one row per record in user_repos and one
# 'languages_<name>' column per entry in common_languages (0 where the user lacks it).
df_langs = dicts_to_pandas(user_repos, 'languages', common_languages)
df_langs

Unnamed: 0,username,languages_JavaScript,languages_Python,languages_TypeScript,languages_HTML,languages_Go,languages_Java,languages_PHP,languages_Jupyter Notebook,languages_CSS,...,languages_Smarty,languages_Perl,languages_Groovy,languages_Assembly,languages_Cuda,languages_Blade,languages_Nix,languages_CoffeeScript,languages_Less,languages_Batchfile
0,SaiNageswarS,0.000,0.000,0.250,0.000,0.500,0.000,0.000,0.0,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0
1,dennissiq,0.187,0.000,0.263,0.335,0.000,0.000,0.000,0.0,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0
2,guoyoujin,0.000,0.051,0.000,0.000,0.179,0.202,0.000,0.0,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0
3,redshiftzero,0.000,0.423,0.000,0.071,0.000,0.000,0.000,0.0,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0
4,pedrofsn,0.147,0.000,0.204,0.030,0.000,0.002,0.000,0.0,0.019,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35904,LukeMichaels,0.373,0.000,0.000,0.091,0.000,0.000,0.252,0.0,0.032,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0
35905,hemlok,0.531,0.000,0.452,0.015,0.000,0.000,0.000,0.0,0.002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0
35906,sfate,0.005,0.000,0.000,0.027,0.000,0.000,0.000,0.0,0.015,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0
35907,DemoYolk,0.196,0.000,0.000,0.199,0.244,0.000,0.000,0.0,0.066,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0


In [17]:
# Per-user topic features: one row per record in user_repos and one
# 'topics_<name>' column per entry in common_topics (0 where the user lacks it).
df_topics = dicts_to_pandas(user_repos, 'topics', common_topics)
df_topics

Unnamed: 0,username,topics_react,topics_config,topics_github-config,topics_typescript,topics_javascript,topics_python,topics_nodejs,topics_deep-learning,topics_machine-learning,...,topics_algorithm,topics_plugin,topics_vercel,topics_music,topics_vue3,topics_security,topics_cryptocurrency,topics_data,topics_rails,topics_twitter
0,SaiNageswarS,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,dennissiq,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,guoyoujin,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,redshiftzero,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,pedrofsn,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35904,LukeMichaels,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35905,hemlok,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35906,sfate,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35907,DemoYolk,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Combine language and topic features per user. validate='one_to_one' makes
# pandas raise MergeError if a username is repeated on either side, instead of
# silently multiplying that user's rows in the result.
df_features = df_langs.merge(df_topics, on='username', validate='one_to_one')

In [19]:
# Inspect the merged table: 'username', then the language columns, then the topic columns.
df_features

Unnamed: 0,username,languages_JavaScript,languages_Python,languages_TypeScript,languages_HTML,languages_Go,languages_Java,languages_PHP,languages_Jupyter Notebook,languages_CSS,...,topics_algorithm,topics_plugin,topics_vercel,topics_music,topics_vue3,topics_security,topics_cryptocurrency,topics_data,topics_rails,topics_twitter
0,SaiNageswarS,0.000,0.000,0.250,0.000,0.500,0.000,0.000,0.0,0.000,...,0,0,0,0,0,0,0,0,0,0
1,dennissiq,0.187,0.000,0.263,0.335,0.000,0.000,0.000,0.0,0.000,...,0,0,0,0,0,0,0,0,0,0
2,guoyoujin,0.000,0.051,0.000,0.000,0.179,0.202,0.000,0.0,0.000,...,0,0,0,0,0,0,0,0,0,0
3,redshiftzero,0.000,0.423,0.000,0.071,0.000,0.000,0.000,0.0,0.000,...,0,0,0,0,0,0,0,0,0,0
4,pedrofsn,0.147,0.000,0.204,0.030,0.000,0.002,0.000,0.0,0.019,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35904,LukeMichaels,0.373,0.000,0.000,0.091,0.000,0.000,0.252,0.0,0.032,...,0,0,0,0,0,0,0,0,0,0
35905,hemlok,0.531,0.000,0.452,0.015,0.000,0.000,0.000,0.0,0.002,...,0,0,0,0,0,0,0,0,0,0
35906,sfate,0.005,0.000,0.000,0.027,0.000,0.000,0.000,0.0,0.015,...,0,0,0,0,0,0,0,0,0,0
35907,DemoYolk,0.196,0.000,0.000,0.199,0.244,0.000,0.000,0.0,0.066,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Write the merged feature table to cfg.USER_FEATURES_FILE without the row index.
# NOTE(review): this cell shows execution count 21 while the next cell shows 20,
# and 21 appears again further down - the cells were run out of order. Re-run
# the notebook top to bottom before relying on the saved outputs.
df_features.to_csv(cfg.USER_FEATURES_FILE, index=False)

In [20]:
# Feature names are every column after the leading 'username' column.
feats = df_features.columns[1:].tolist()
feats

['languages_JavaScript',
 'languages_Python',
 'languages_TypeScript',
 'languages_HTML',
 'languages_Go',
 'languages_Java',
 'languages_PHP',
 'languages_Jupyter Notebook',
 'languages_CSS',
 'languages_Shell',
 'languages_C++',
 'languages_Ruby',
 'languages_C',
 'languages_Rust',
 'languages_C#',
 'languages_Swift',
 'languages_Vue',
 'languages_Kotlin',
 'languages_Dockerfile',
 'languages_SCSS',
 'languages_Dart',
 'languages_Objective-C',
 'languages_R',
 'languages_Makefile',
 'languages_TeX',
 'languages_Lua',
 'languages_Elixir',
 'languages_Scala',
 'languages_Vim Script',
 'languages_HCL',
 'languages_Solidity',
 'languages_MATLAB',
 'languages_Clojure',
 'languages_PowerShell',
 'languages_CMake',
 'languages_Haskell',
 'languages_Emacs Lisp',
 'languages_Svelte',
 'languages_Julia',
 'languages_null',
 'languages_Smarty',
 'languages_Perl',
 'languages_Groovy',
 'languages_Assembly',
 'languages_Cuda',
 'languages_Blade',
 'languages_Nix',
 'languages_CoffeeScript',
 'lan

In [21]:
# Persist the feature-column names to cfg.FEATURE_NAMES_FILE. The context
# manager closes (and flushes) the file even if pickling raises; the bare
# open() call left the handle to be closed whenever it got garbage-collected.
with open(cfg.FEATURE_NAMES_FILE, 'wb') as fh:
    pickle.dump(feats, fh)

['languages_JavaScript',
 'languages_Python',
 'languages_TypeScript',
 'languages_HTML',
 'languages_Go',
 'languages_Java',
 'languages_PHP',
 'languages_Jupyter Notebook',
 'languages_CSS',
 'languages_Shell',
 'languages_C++',
 'languages_Ruby',
 'languages_C',
 'languages_Rust',
 'languages_C#',
 'languages_Swift',
 'languages_Vue',
 'languages_Kotlin',
 'languages_Dockerfile',
 'languages_SCSS',
 'languages_Dart',
 'languages_Objective-C',
 'languages_R',
 'languages_Makefile',
 'languages_TeX',
 'languages_Lua',
 'languages_Elixir',
 'languages_Scala',
 'languages_Vim Script',
 'languages_HCL',
 'languages_Solidity',
 'languages_MATLAB',
 'languages_Clojure',
 'languages_PowerShell',
 'languages_CMake',
 'languages_Haskell',
 'languages_Emacs Lisp',
 'languages_Svelte',
 'languages_Julia',
 'languages_null',
 'languages_Smarty',
 'languages_Perl',
 'languages_Groovy',
 'languages_Assembly',
 'languages_Cuda',
 'languages_Blade',
 'languages_Nix',
 'languages_CoffeeScript',
 'lan