In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.preprocessing import FunctionTransformer

## Read Data From File

In [21]:
data = pd.read_csv('../data/training_data.csv', index_col=0)
data.fillna(0, inplace=True)

In [22]:
data.head()

Unnamed: 0,Username,Id,Contributions,JavaScript,Python,Java,C#,PHP,TypeScript,Ruby,...,Dart,Vue,Assembly,Sass,CSS,HTML,Pascal,Racket,Zig,Other
mojombo,1,79,0,1970379,0,0,0,0,2566099,928419,...,0,0,0,0,7758,17442,0,0,0,240167
defunkt,2,1,0,4609687,0,0,0,0,0,470103,...,0,0,0,0,6399,23938,0,0,0,896415
wycats,4,358,0,3378185,0,0,0,0,2687760,952,...,0,0,0,0,21641,441513,0,0,0,3060
brynary,19,3473,0,11661,0,0,0,0,0,951748,...,0,0,0,0,0,17954,0,0,0,3142
kevinclark,20,18,0,0,0,0,0,0,0,43311,...,0,0,0,0,0,0,0,0,0,8688


In [23]:
data.describe()

Unnamed: 0,Username,Id,Contributions,JavaScript,Python,Java,C#,PHP,TypeScript,Ruby,...,Dart,Vue,Assembly,Sass,CSS,HTML,Pascal,Racket,Zig,Other
count,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,...,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0
mean,46.708333,857.958333,0.0,494199.7,46069.708333,2631.166667,0.0,0.0,221971.2,1330818.0,...,0.0,8326.041667,0.0,0.0,60714.38,67725.916667,0.0,0.0,0.0,2230629.0
std,28.082919,1260.347243,0.0,1180305.0,177143.585184,12890.031523,0.0,0.0,740997.9,4241499.0,...,0.0,40789.107321,0.0,0.0,213681.4,168493.736747,0.0,0.0,0.0,9918130.0
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,21.75,49.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4116.25
50%,47.5,360.0,0.0,479.0,0.0,0.0,0.0,0.0,0.0,10346.0,...,0.0,0.0,0.0,0.0,29.5,204.0,0.0,0.0,0.0,43261.0
75%,72.5,1056.25,0.0,93979.25,0.0,0.0,0.0,0.0,642.25,362390.2,...,0.0,0.0,0.0,0.0,6738.75,18569.5,0.0,0.0,0.0,290077.2
max,80.0,4240.0,0.0,4609687.0,862049.0,63148.0,0.0,0.0,2687760.0,19676540.0,...,0.0,199825.0,0.0,0.0,1023197.0,673723.0,0.0,0.0,0.0,48766020.0


In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24 entries, mojombo to ry
Data columns (total 31 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Username       24 non-null     int64
 1   Id             24 non-null     int64
 2   Contributions  24 non-null     int64
 3   JavaScript     24 non-null     int64
 4   Python         24 non-null     int64
 5   Java           24 non-null     int64
 6   C#             24 non-null     int64
 7   PHP            24 non-null     int64
 8   TypeScript     24 non-null     int64
 9   Ruby           24 non-null     int64
 10  C++            24 non-null     int64
 11  C              24 non-null     int64
 12  Swift          24 non-null     int64
 13  Go             24 non-null     int64
 14  Shell          24 non-null     int64
 15  Kotlin         24 non-null     int64
 16  Rust           24 non-null     int64
 17  PowerShell     24 non-null     int64
 18  Objective-C    24 non-null     int64
 19  R        

## Transform Data

#### 1. Make the bytes into percentages

In [25]:
col = ["Id", "Contributions", "JavaScript", "Python", "Java", "C#", "PHP", "TypeScript", "Ruby", "C++", "C", "Swift", "Go", "Shell", "Kotlin", "Rust", "PowerShell", "Objective-C", "R", "MATLAB", "Dart", "Vue", "Assembly", "Sass", "CSS", "HTML", "Pascal", "Racket", "Zig", "Other"]
tcols = [x + '-T' for x in col]
def turn_to_percent(X, columns):
    X[columns] = X[columns].div(X[columns].sum(axis=1), axis=0)
    return X

# Create a FunctionTransformer using the defined function and pass the subset_columns argument
transformer = FunctionTransformer(turn_to_percent, validate=False, kw_args={'columns': col[2:]})

# Apply the transformation to your dataset
data = transformer.transform(data)
data

Unnamed: 0,Username,Id,Contributions,JavaScript,Python,Java,C#,PHP,TypeScript,Ruby,...,Dart,Vue,Assembly,Sass,CSS,HTML,Pascal,Racket,Zig,Other
mojombo,1,79,0,0.343371,0.0,0.0,0.0,0.0,0.447185,0.161792,...,0.0,0.0,0.0,0.0,0.001352,0.00304,0.0,0.0,0.0,0.041853
defunkt,2,1,0,0.7188,0.0,0.0,0.0,0.0,0.0,0.073304,...,0.0,0.0,0.0,0.0,0.000998,0.003733,0.0,0.0,0.0,0.13978
wycats,4,358,0,0.516904,0.0,0.0,0.0,0.0,0.41126,0.000146,...,0.0,0.0,0.0,0.0,0.003311,0.067557,0.0,0.0,0.0,0.000468
brynary,19,3473,0,0.011735,0.0,0.0,0.0,0.0,0.0,0.957798,...,0.0,0.0,0.0,0.0,0.0,0.018068,0.0,0.0,0.0,0.003162
kevinclark,20,18,0,0.0,0.0,0.0,0.0,0.0,0.0,0.227816,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045699
technoweenie,21,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.127858,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014873
macournoyer,22,366,0,0.000123,0.0,0.052369,0.0,0.0,0.0,0.795518,...,0.0,0.0,0.0,0.0,0.001833,0.016931,0.0,0.0,0.0,0.027712
caged,25,417,0,0.573919,0.0,0.0,0.0,0.0,0.019111,0.013859,...,0.0,0.0,0.0,0.0,0.000439,0.003035,0.0,0.0,0.0,0.361131
topfunky,26,420,0,0.0,0.0,0.0,0.0,0.0,0.0,0.698668,...,0.0,0.0,0.0,0.0,0.0,0.188127,0.0,0.0,0.0,0.001471
lukas,29,4,0,0.008391,0.183526,0.0,0.0,0.0,0.0,0.024964,...,0.0,0.0,0.0,0.0,0.000249,0.006035,0.0,0.0,0.0,0.774049


#### 2. Standardize the data using a StandardScaler

In [26]:
scaler = StandardScaler()

data[tcols] = scaler.fit_transform(data[col])
data.head()

Unnamed: 0,Username,Id,Contributions,JavaScript,Python,Java,C#,PHP,TypeScript,Ruby,...,Dart-T,Vue-T,Assembly-T,Sass-T,CSS-T,HTML-T,Pascal-T,Racket-T,Zig-T,Other-T
mojombo,1,79,0,0.343371,0.0,0.0,0.0,0.0,0.447185,0.161792,...,0.0,-0.208514,0.0,0.0,-0.230628,-0.370226,0.0,0.0,0.0,-0.634826
defunkt,2,1,0,0.7188,0.0,0.0,0.0,0.0,0.0,0.073304,...,0.0,-0.208514,0.0,0.0,-0.234568,-0.352541,0.0,0.0,0.0,-0.364786
wycats,4,358,0,0.516904,0.0,0.0,0.0,0.0,0.41126,0.000146,...,0.0,-0.208514,0.0,0.0,-0.208829,1.275851,0.0,0.0,0.0,-0.748948
brynary,19,3473,0,0.011735,0.0,0.0,0.0,0.0,0.0,0.957798,...,0.0,-0.208514,0.0,0.0,-0.245669,0.013208,0.0,0.0,0.0,-0.74152
kevinclark,20,18,0,0.0,0.0,0.0,0.0,0.0,0.0,0.227816,...,0.0,-0.208514,0.0,0.0,-0.245669,-0.447777,0.0,0.0,0.0,-0.624221


#### 3. Determine Number of Clusters

In [27]:
def optimize_kmeans(data, max_k):
    inertia = []
    for k in range(1, max_k):
        kmeans = KMeans(n_clusters=k, random_state=0).fit(data)
        inertia.append(kmeans.inertia_)
    
    # generate elbow
    fig = plt.figure(figsize=(10, 5))
    plt.plot(range(1, max_k), inertia)
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Elbow Method')
    plt.show()

In [28]:
optimize_kmeans(data[tcols], 20)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super().

ValueError: n_samples=24 should be >= n_clusters=25.

#### 4 Apply K-Means Clustering

In [None]:
kmeans = KMeans(n_clusters=20, random_state=0).fit(data[tcols])
data['cluster'] = kmeans.labels_
data.head()

ValueError: n_samples=13 should be >= n_clusters=20.

## Write to File

In [None]:
with open('../data/kmeansmodel.pkl', 'wb') as f:
    pickle.dump(kmeans, f)