In [2]:
#Import libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
import seaborn as sns

In [5]:
# Load the dataset from a CSV file in the working directory
csv_path = 'overall.csv'
df_clean = pd.read_csv(csv_path)

In [6]:
# Features: the 'selftext' text column. Target: the 'Python' label column.
X, y = df_clean['selftext'], df_clean['Python']

# Hold-out split, stratified on the target and seeded for reproducibility.
split = train_test_split(X, y, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split

In [7]:
# Learn the vocabulary from the training text only (tokens must occur in at
# least 5 documents), then encode both splits with that same vocabulary.
cvec = CountVectorizer(min_df=5)
X_train_cvec = cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)

In [12]:
# Vocabulary size, i.e. the number of token-count features.
# `vocabulary_` is used instead of `get_feature_names()`, which was deprecated
# in scikit-learn 1.0 and removed in 1.2; `vocabulary_` exists in all releases.
len(cvec.vocabulary_)

6230

In [14]:
# Convert the sparse count matrices to dense DataFrames, one column per token.
# Column names are rebuilt from `vocabulary_` (token -> column index), sorted by
# column index so they line up with the matrix columns. This avoids
# `get_feature_names()`, which was removed in scikit-learn 1.2, and also works
# on older releases that lack `get_feature_names_out()`.
feature_names = sorted(cvec.vocabulary_, key=cvec.vocabulary_.get)
X_train_cv = pd.DataFrame(X_train_cvec.toarray(), columns=feature_names)
X_test_cv = pd.DataFrame(X_test_cvec.toarray(), columns=feature_names)
X_train_cv.head()

Unnamed: 0,00,000,0000,00000,0001,0002,001,00103,002,005,...,youtube,yt,yy,yyyy,zero,zeros,zip,zipcode,zone,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Remove token columns whose name is purely numeric (e.g. '00', '0001')
# from the training features.
numeric_cols = [name for name in X_train_cv.columns if name.isnumeric()]
X_train_cv = X_train_cv.drop(columns=numeric_cols)
X_train_cv.head()

Unnamed: 0,0rc1,100k,10gb,10k,10x,11g,15k,1nf,1st,2008r2,...,youtube,yt,yy,yyyy,zero,zeros,zip,zipcode,zone,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Remove token columns whose name is purely numeric from the test features,
# mirroring what was done to the training features.
numeric_cols = [name for name in X_test_cv.columns if name.isnumeric()]
X_test_cv = X_test_cv.drop(columns=numeric_cols)
X_test_cv.head()

Unnamed: 0,0rc1,100k,10gb,10k,10x,11g,15k,1nf,1st,2008r2,...,youtube,yt,yy,yyyy,zero,zeros,zip,zipcode,zone,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Check that the token 'python' exists as a feature column and look at its
# per-document counts before it is removed from the features.
X_train_cv['python']

0       0
1       0
2       0
3       0
4       6
       ..
7495    0
7496    0
7497    1
7498    0
7499    0
Name: python, Length: 7500, dtype: int64

In [22]:
# Drop the 'python' token column from the training features
# (presumably because it gives away the 'Python' target; confirm intent).
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the column is already gone.
X_train_cv = X_train_cv.drop(columns=['python'], errors='ignore')

In [23]:
# Preview the training features after dropping the 'python' column
X_train_cv.head()

Unnamed: 0,0rc1,100k,10gb,10k,10x,11g,15k,1nf,1st,2008r2,...,youtube,yt,yy,yyyy,zero,zeros,zip,zipcode,zone,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Drop the 'sql' token column from the training features
# (presumably because it identifies the other class; confirm intent).
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the column is already gone.
X_train_cv = X_train_cv.drop(columns=['sql'], errors='ignore')

In [25]:
# Preview the training features after dropping the 'sql' column
X_train_cv.head()

Unnamed: 0,0rc1,100k,10gb,10k,10x,11g,15k,1nf,1st,2008r2,...,youtube,yt,yy,yyyy,zero,zeros,zip,zipcode,zone,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Drop the 'python' and 'sql' token columns from the test features so the test
# matrix keeps the same columns as the training matrix.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the columns are already gone.
X_test_cv = X_test_cv.drop(columns=['python', 'sql'], errors='ignore')

In [27]:
# Preview the test features after dropping the 'python' and 'sql' columns
X_test_cv.head()

Unnamed: 0,0rc1,100k,10gb,10k,10x,11g,15k,1nf,1st,2008r2,...,youtube,yt,yy,yyyy,zero,zeros,zip,zipcode,zone,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Baseline Accuracy

In [28]:
# Baseline accuracy: class proportions of the training target. A model that
# always predicts the most frequent class would score the largest proportion.
y_train.value_counts(normalize=True)

0    0.5
1    0.5
Name: Python, dtype: float64

# Logistic Regression

In [29]:
# Fit an L2-penalised logistic regression on the token counts and report
# (train accuracy, test accuracy).
lr = LogisticRegression(penalty='l2')
lr.fit(X_train_cv, y_train)

train_acc = lr.score(X_train_cv, y_train)
test_acc = lr.score(X_test_cv, y_test)
train_acc, test_acc

(0.9972, 0.9588)

In [30]:
# The logistic regression overfits: its training accuracy is higher than its test accuracy.

In [31]:
# Show the fitted intercept and the raw coefficient array (log-odds scale)
print(f'Logistic Regression Intercept: {lr.intercept_}')
print(f'Logistic Regression Coefficient: {lr.coef_}')

Logistic Regression Intercept: [0.60716558]
Logistic Regression Coefficient: [[ 5.24856623e-04 -8.47553855e-05  1.69783538e-02 ... -6.18877737e-04
  -4.66934769e-04  1.38780948e-01]]


In [32]:
# Number of fitted coefficients: the shape is (1, number of feature columns)
lr.coef_.shape

(1, 5989)

In [33]:
# Display the training feature matrix the model was fitted on
# (pandas truncates the rendering of large frames).
X_train_cv

Unnamed: 0,0rc1,100k,10gb,10k,10x,11g,15k,1nf,1st,2008r2,...,youtube,yt,yy,yyyy,zero,zeros,zip,zipcode,zone,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7496,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7497,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [34]:
# Label each fitted coefficient with the token (column) it belongs to
coef = pd.Series(lr.coef_.ravel(), index=X_train_cv.columns)
coef

0rc1       0.000525
100k      -0.000085
10gb       0.016978
10k       -0.024008
10x       -0.097695
             ...   
zeros      0.040324
zip       -0.315919
zipcode   -0.000619
zone      -0.000467
zoom       0.138781
Length: 5989, dtype: float64

In [35]:
# Exponentiate the coefficients so each value reads as a multiplicative effect
# on the odds (an odds ratio) per additional occurrence of the token.
odds_ratios = np.exp(lr.coef_[0])
coef1 = pd.Series(odds_ratios, index=X_train_cv.columns)
coef1

0rc1       1.000525
100k       0.999915
10gb       1.017123
10k        0.976278
10x        0.906926
             ...   
zeros      1.041148
zip        0.729119
zipcode    0.999381
zone       0.999533
zoom       1.148872
Length: 5989, dtype: float64

In [51]:
# Wrap the exponentiated coefficients in a single-column DataFrame
# (the unnamed Series becomes column 0).
coefficients = pd.DataFrame(coef1)
coefficients

Unnamed: 0,0
0rc1,1.000525
100k,0.999915
10gb,1.017123
10k,0.976278
10x,0.906926
...,...
zeros,1.041148
zip,0.729119
zipcode,0.999381
zone,0.999533


In [52]:
# Give the single column (0) a readable name
coefficients = coefficients.rename(columns={0: 'Coefficients'})


In [53]:
# Rank tokens from the largest to the smallest exponentiated coefficient
coefficients.sort_values('Coefficients', ascending=False)

Unnamed: 0,Coefficients
https,7.919949
x200b,5.233946
github,4.656340
api,3.966020
thread,3.618439
...,...
queries,0.194635
tables,0.169283
query,0.159342
database,0.155438


# Random Forests and Extra Trees

In [38]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [39]:
# Small 10-tree ensembles for a quick cross-validated comparison.
# random_state is fixed (same seed as the rest of the notebook) so the
# cross-validation scores are reproducible between runs.
rf = RandomForestClassifier(n_estimators=10, random_state=42)

et = ExtraTreesClassifier(n_estimators=10, random_state=42)

In [40]:
# Mean 5-fold cross-validated score of the random forest on the training set
cross_val_score(rf, X_train_cv, y_train, cv=5).mean()

0.9158666666666667

In [41]:
# Mean 5-fold cross-validated score of the extra-trees model on the training set
cross_val_score(et, X_train_cv, y_train, cv=5).mean()

0.9052

In [42]:
# Grid-search a random forest over tree count, tree depth and features tried
# per split, using 5-fold cross-validation on the training set.
rf = RandomForestClassifier(random_state=42)

rf_params = {
    'n_estimators': [10, 20, 50, 100],
    'max_depth': [None, 3, 4, 5],
    # 'sqrt' replaces 'auto': for classifiers the two are the same setting, but
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
    'max_features': ['sqrt', 100, 200, 300, 4, 5, 50]
}

gs = GridSearchCV(rf, param_grid=rf_params, cv=5)
gs.fit(X_train_cv, y_train)
print(gs.best_score_)  # best mean cross-validated score
gs.best_params_

0.9409333333333333


{'max_depth': None, 'max_features': 50, 'n_estimators': 100}

In [43]:
# Score of the best random-forest configuration on the held-out test set
gs.score(X_test_cv, y_test)

0.9452

In [44]:
# Grid-search an extra-trees classifier over tree count, tree depth and
# features tried per split, using 5-fold cross-validation on the training set.
et = ExtraTreesClassifier(random_state=42)

et_params = {
    'n_estimators': [10, 20, 50, 100],
    'max_depth': [None, 3, 4, 5],
    # 'sqrt' replaces 'auto': for classifiers the two are the same setting, but
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
    'max_features': ['sqrt', 100, 200, 300, 4, 5, 50]
}

gs = GridSearchCV(et, param_grid=et_params, cv=5)
gs.fit(X_train_cv, y_train)
print(gs.best_score_)  # best mean cross-validated score
gs.best_params_

0.9405333333333334


{'max_depth': None, 'max_features': 300, 'n_estimators': 100}

In [45]:
# Score of the best extra-trees configuration on the held-out test set
gs.score(X_test_cv, y_test)

0.9456

# Support Vector Machines


In [46]:
from sklearn import svm

In [47]:
# Grid-search an SVM over kernel type and regularisation strength C with
# 5-fold cross-validation, then report (train score, test score) for the
# best configuration.
svc_params = dict(
    kernel=['rbf', 'linear', 'poly', 'sigmoid'],
    C=[1.0, 0.5, 2.0, 5.0],
)
svc = svm.SVC()

gs = GridSearchCV(estimator=svc, param_grid=svc_params, cv=5)
gs.fit(X_train_cv, y_train)
print(gs.best_score_)

gs.score(X_train_cv, y_train), gs.score(X_test_cv, y_test)

0.9414666666666666


(0.9792, 0.9484)

In [48]:
# Best kernel / C combination found by the SVC grid search
gs.best_params_

{'C': 5.0, 'kernel': 'rbf'}

In [49]:
# The SVC estimator configured with the best parameters from the grid search
gs.best_estimator_

SVC(C=5.0)