In [None]:
import numpy as np 
import pandas as pd 
from google.cloud import bigquery
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
#from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import eli5
from eli5.sklearn import PermutationImportance
from matplotlib import pyplot as plt
from pdpbox import pdp, get_dataset, info_plots

In [None]:
# the number of new users in each year:
# counts user ids in the public Stack Overflow `users` table, grouped by the year of creation_date
client = bigquery.Client()
query = '''
        SELECT extract (year from creation_date) as year,
               count(id) as n_new_user
        FROM `bigquery-public-data.stackoverflow.users` 
        GROUP BY year
        ORDER BY year
        '''
query_job = client.query(query)
# one row per creation year, with columns year and n_new_user
new_user = query_job.to_dataframe()      

# the number of users who have been inactive for at least one year,with the last access in 2019 or before:
# counts user ids grouped by the year of last_access_date; HAVING keeps only years before 2020
# NOTE(review): "inactive for at least one year" presumes the dataset snapshot is from 2020 or later - confirm
query = '''
        SELECT extract(year from last_access_date) as year,
               count(id) as n_inact_user
        FROM `bigquery-public-data.stackoverflow.users` 
        GROUP BY year
        HAVING year < 2020
        ORDER BY year
        '''
query_job = client.query(query)
# one row per last-access year, with columns year and n_inact_user
inact_user = query_job.to_dataframe()    

In [None]:
# Inactive users increase relative to new users: align the two yearly counts
# on their shared year (inner join) and draw both series against the year.
df = new_user.merge(inact_user, how='inner', on='year')
counts_by_year = df.set_index('year')
counts_by_year[['n_new_user','n_inact_user']].plot()

In [None]:
# To predict when users become inactive (i.e. whether a user becomes inactive within one year), focus on users:
# 1) who have been inactive for at least one year, with their last access in 2019 or before
# 2) who started early and thus have a long history. Users who started in 2008 are a small sample, so choose users who started in 2009
# Focus on the user behaviors of posting questions, answering questions and making comments, because these are the main behaviors and are available at different time points
# The time when a user becomes inactive in terms of these behaviors could be the same as or different from her last access date,
# because the last access date may reflect behaviors that are not available at different time points, such as viewing or voting

In [None]:
# each user's question posting and getting feedback from other users in each year
# CTE `a`: per (owner_user_id, year of the question's creation_date) it counts questions and
# accepted answers and averages answer_count, comment_count, favorite_count, view_count and score.
# The outer query inner-joins `a` to the users table and keeps users created in 2009 whose
# last access year is before 2020; start_year / quit_year come from the users table columns.
# Note: the trailing backslash inside this (non-raw) string is a line continuation, so the two
# WHERE conditions end up on one line of the SQL text that is sent.
query = '''
        WITH a AS
        (SELECT owner_user_id,
                EXTRACT(year from creation_date) as year, 
                count(id) as n_question, 
                count(accepted_answer_id) as n_accepted_answer, 
                avg(answer_count) as mean_answer_count,
                avg(comment_count) as mean_comment_count,
                avg(favorite_count) as mean_favorite_count,
                avg(view_count) as mean_view_count,
                avg(score) as mean_score
         FROM `bigquery-public-data.stackoverflow.posts_questions`
         GROUP BY owner_user_id,year 
        )
        SELECT a.owner_user_id, a.year,a.n_question,
               a.n_accepted_answer,a.mean_answer_count,a.mean_comment_count,
               a.mean_favorite_count,a.mean_view_count,a.mean_score,
               EXTRACT(year from creation_date) as start_year,
               EXTRACT(year from last_access_date) as quit_year
        FROM `bigquery-public-data.stackoverflow.users` AS b
        INNER JOIN a
        ON a.owner_user_id = b.id
        WHERE EXTRACT(year from b.creation_date) = 2009 and \
        EXTRACT(year from b.last_access_date) < 2020
        ORDER BY a.owner_user_id,a.year
        '''
query_job = client.query(query)
# one row per (user, year in which the user posted at least one question)
question_feedback_start_2009 = query_job.to_dataframe() 

In [None]:
# each user's answering questions posted by others in each year
# CTE `a`: number of answers per (owner_user_id, year of the answer's creation_date).
# The outer query inner-joins `a` to the users table and keeps users created in 2009 whose
# last access year is before 2020; start_year / quit_year come from the users table columns.
# Note: the trailing backslash inside this (non-raw) string is a line continuation, so the two
# WHERE conditions end up on one line of the SQL text that is sent.
query = '''
        WITH a AS
        (SELECT owner_user_id,
                EXTRACT(year from creation_date) as year, 
                count(id) as n_answer
         FROM `bigquery-public-data.stackoverflow.posts_answers`
         GROUP BY owner_user_id,year 
        )
        SELECT a.owner_user_id, a.year,a.n_answer,
               EXTRACT(year from creation_date) as start_year,
               EXTRACT(year from last_access_date) as quit_year
        FROM `bigquery-public-data.stackoverflow.users` AS b
        INNER JOIN a
        ON a.owner_user_id = b.id
        WHERE EXTRACT(year from b.creation_date) = 2009 and \
        EXTRACT(year from b.last_access_date) < 2020
        ORDER BY a.owner_user_id,a.year
        '''
query_job = client.query(query)
# one row per (user, year in which the user posted at least one answer)
answer_start_2009 = query_job.to_dataframe()

In [None]:
# each user's comment making in each year
# CTE `a`: number of comments per (user_id, year of the comment's creation_date); user_id is
# aliased to owner_user_id so the result shares key names with the question and answer frames.
# The outer query inner-joins `a` to the users table and keeps users created in 2009 whose
# last access year is before 2020; start_year / quit_year come from the users table columns.
# Note: the trailing backslash inside this (non-raw) string is a line continuation, so the two
# WHERE conditions end up on one line of the SQL text that is sent.
query = '''
        WITH a AS
        (SELECT user_id as owner_user_id,
                EXTRACT(year from creation_date) as year, 
                count(id) as n_comment 
         FROM `bigquery-public-data.stackoverflow.comments`
         GROUP BY owner_user_id,year 
        )
        SELECT a.owner_user_id, a.year,a.n_comment,
               EXTRACT(year from creation_date) as start_year,
               EXTRACT(year from last_access_date) as quit_year
        FROM `bigquery-public-data.stackoverflow.users` AS b
        INNER JOIN a
        ON a.owner_user_id = b.id 
        WHERE EXTRACT(year from b.creation_date) = 2009 and \
        EXTRACT(year from b.last_access_date) < 2020
        ORDER BY a.owner_user_id,a.year
        '''
query_job = client.query(query)
# one row per (user, year in which the user made at least one comment)
comment_start_2009 = query_job.to_dataframe() 

In [None]:
# merge questions and feedback, answering, and commenting
# All three frames carry start_year and quit_year, which are taken from the
# users table and are therefore the same for every row of a given user.
# Joining on them as well keeps a single, fully populated copy of each column;
# joining on (owner_user_id, year) alone produced start_year_x / start_year_y
# style duplicates whose gaps were later zero-filled.
merge_keys = ['owner_user_id','year','start_year','quit_year']
start_2009 = pd.merge(answer_start_2009,comment_start_2009,on=merge_keys,how='outer')
start_2009 = pd.merge(start_2009,question_feedback_start_2009,on=merge_keys,how='outer')

In [None]:
# generate target and features
# Target: 1 on each user's last observed activity year, 0 on all earlier years.
last_observed_year = start_2009.groupby('owner_user_id')['year'].transform('max')
start_2009['last_before_quit'] = (start_2009['year'] == last_observed_year).astype(int)
# The outer merges leave NaN where a user had no activity of a given kind in a
# year; treat those as zero activity.
start_2009 = start_2009.fillna(0)

# Share of a user's questions in the year that have an accepted answer.
# Years without questions are set to 0, in line with the other zero-filled
# question-feedback averages. Dividing directly gives 0/0 = NaN for those
# years, and the later dropna() would silently discard them.
asked_questions = start_2009['n_question'].where(start_2009['n_question'] > 0)
start_2009['mean_accepted_answer'] = start_2009['n_accepted_answer'].div(asked_questions).fillna(0)

# Lagged features: shift(1) within each user after sorting by year, i.e. the
# value from the user's previous observed row (not necessarily the previous
# calendar year if the user skipped a year). The first row of every user gets
# NaN lags.
start_2009 = start_2009.sort_values(['owner_user_id','year'])
lag_sources = ['n_comment','n_question','n_answer','mean_accepted_answer',
               'mean_answer_count','mean_comment_count','mean_favorite_count',
               'mean_view_count','mean_score']
for var in lag_sources:
    start_2009['l_'+var] = start_2009.groupby('owner_user_id')[var].shift(1)

In [None]:
# drop obs. with missing values and drop those users with only one obs.
# Take an explicit copy so that adding n_obs below writes to an independent
# frame instead of triggering SettingWithCopyWarning on the dropna() result.
start_2009_copy = start_2009.dropna().copy()
# number of remaining rows per user
start_2009_copy['n_obs'] = start_2009_copy.groupby('owner_user_id')['year'].transform('count')
start_2009_copy = start_2009_copy[start_2009_copy.n_obs>1]

In [None]:
# mild imbalance: 20% minority (row counts per value of the target)
start_2009_copy.loc[:, 'last_before_quit'].value_counts()

In [None]:
# validation
# Features are the lagged activity / feedback columns; the target marks the
# user's last observed activity year.
X = start_2009_copy[['l_n_comment','l_n_question','l_n_answer','l_mean_accepted_answer',
                     'l_mean_answer_count','l_mean_comment_count','l_mean_favorite_count',
                     'l_mean_view_count','l_mean_score']]
y = start_2009_copy['last_before_quit']

# Stratified hold-out split. The single-fit XGB cell and the permutation
# importance / partial dependence cells read train_X, val_X, train_y and val_y,
# so the split has to exist for the notebook to run from a fresh kernel.
train_X, val_X, train_y, val_y = train_test_split(X,y,
                                                  stratify=y,
                                                  random_state=0)

In [None]:
# scaling not necessary for tree-based algorithms
# summary statistics of every feature column, to inspect their ranges
X.describe()

In [None]:
# number of rows per target value in the modelling sample
y.value_counts()

In [None]:
# class ratio: number of negatives (y == 0) per positive (y == 1)
n_negative = int((y == 0).sum())
n_positive = int((y == 1).sum())
class_ratio = n_negative / n_positive

In [None]:
# XGB: (imbalanced) class weight, single fit evaluated on the hold-out split
# (train_X / val_X / train_y / val_y come from the stratified split of X, y).
# The positive class is weighted by the observed negative/positive ratio
# instead of a hard-coded 4.
model = XGBClassifier(objective='binary:logistic',max_depth=10,
                      n_estimators=100,learning_rate=0.1,
                      scale_pos_weight=class_ratio)
model.fit(train_X,train_y)
preds = model.predict(val_X)
# ROC AUC is a ranking metric, so score it with the predicted probability of
# the positive class. Thresholded 0/1 labels collapse the curve to a single
# operating point and are not comparable with the 'roc_auc' scorer used in the
# cross-validation cells.
pos_proba = model.predict_proba(val_X)[:,1]
accuracy = metrics.accuracy_score(val_y,preds)
auc = metrics.roc_auc_score(val_y,pos_proba)
precision = metrics.precision_score(val_y,preds)
recall = metrics.recall_score(val_y,preds)
print('accuracy:',accuracy,
      'auc:',auc,
      'precision:',precision,
      'recall:',recall
     )

In [None]:
# XGB results (values in the order the cells print them: accuracy, roc_auc, precision, recall):
# baseline: 0.79, 0.69, 0.52, 0.06
# class weight: 0.62, 0.69, 0.31, 0.66
# random oversample but no class weight: 0.64, 0.69, 0.32, 0.62
# smote but no class weight: 0.78, 0.67, 0.46, 0.11
# undersample and upweight: 0.78, 0.68, 0.43, 0.1

In [None]:
# XGB: already stratified
# early_stopping_rounds is not set: early stopping needs an eval_set at fit
# time and cross_validate never passes one, so the setting either raises or is
# ignored, depending on the xgboost version.
model = XGBClassifier(objective='binary:logistic',max_depth=6,
                      n_estimators=100,learning_rate=0.1)
scores = cross_validate(model,X,y,cv=5,scoring=('accuracy','roc_auc',
                                       'precision','recall'))
# cross_validate cv, stratified kfold is used for classifier model
print("average test accuracy:",scores['test_accuracy'].mean(),'\n',
      "average test roc_auc:", scores['test_roc_auc'].mean(),'\n',
      "average test precision:",scores['test_precision'].mean(),'\n',
      "average test recall:", scores['test_recall'].mean()
     )

In [None]:
# XGB: class weight, already stratified
# The positive class is weighted by the negative/positive ratio of y.
# early_stopping_rounds is not set: early stopping needs an eval_set at fit
# time and cross_validate never passes one, so the setting either raises or is
# ignored, depending on the xgboost version.
model = XGBClassifier(objective='binary:logistic',max_depth=6,
                      n_estimators=100,learning_rate=0.1,
                      scale_pos_weight=class_ratio)
scores = cross_validate(model,X,y,cv=5,scoring=('accuracy','roc_auc',
                                       'precision','recall'))
# cross_validate cv, stratified kfold is used for classifier model
print("average test accuracy:",scores['test_accuracy'].mean(),'\n',
      "average test roc_auc:", scores['test_roc_auc'].mean(),'\n',
      "average test precision:",scores['test_precision'].mean(),'\n',
      "average test recall:", scores['test_recall'].mean()
     )

In [None]:
# XGB: no class weight but randomoversample, already stratified
# The sampler sits inside the pipeline so that it is refit on each training
# fold by cross_validate.
# early_stopping_rounds is not set: early stopping needs an eval_set at fit
# time and neither the pipeline nor cross_validate passes one, so the setting
# either raises or is ignored, depending on the xgboost version.
random = RandomOverSampler(random_state=9)
model = XGBClassifier(objective='binary:logistic',max_depth=6,
                      n_estimators=100,learning_rate=0.1)
pipeline = Pipeline(
                    [('over_sampling',random),
                     ('model',model)]
                   )
scores = cross_validate(pipeline,X,y,cv=5,scoring=('accuracy','roc_auc',
                                       'precision','recall'))
# cross_validate cv, stratified kfold is used for classifier model
# prints, for each metric, the per-fold scores followed by their mean
print(scores['test_accuracy'],scores['test_accuracy'].mean(),
      scores['test_roc_auc'],scores['test_roc_auc'].mean(),
      scores['test_precision'],scores['test_precision'].mean(),
      scores['test_recall'],scores['test_recall'].mean()
     )

In [None]:
# XGB: smote but not class weight, already stratified
# Open question: why does smote perform much worse than random in recall?
# early_stopping_rounds is not set: early stopping needs an eval_set at fit
# time and neither the pipeline nor cross_validate passes one, so the setting
# either raises or is ignored, depending on the xgboost version.
smote = SMOTE(random_state=9)
model = XGBClassifier(objective='binary:logistic',max_depth=6,
                      n_estimators=100,learning_rate=0.1)
pipeline = Pipeline(
                    [('smote',smote),
                     ('model',model)]
                   )
scores = cross_validate(pipeline,X,y,cv=5,scoring=('accuracy','roc_auc',
                                       'precision','recall'))
# cross_validate cv, stratified kfold is used for classifier model
# prints, for each metric, the per-fold scores followed by their mean
print(scores['test_accuracy'],scores['test_accuracy'].mean(),
      scores['test_roc_auc'],scores['test_roc_auc'].mean(),
      scores['test_precision'],scores['test_precision'].mean(),
      scores['test_recall'],scores['test_recall'].mean()
     )

In [None]:
# XGB: adasyn but not class weight, already stratified
# early_stopping_rounds is not set: early stopping needs an eval_set at fit
# time and neither the pipeline nor cross_validate passes one, so the setting
# either raises or is ignored, depending on the xgboost version.
adasyn = ADASYN(random_state=9)
model = XGBClassifier(objective='binary:logistic',max_depth=6,
                      n_estimators=100,learning_rate=0.1)
pipeline = Pipeline(
                    [('over_sampling',adasyn),
                     ('model',model)]
                   )
scores = cross_validate(pipeline,X,y,cv=5,scoring=('accuracy','roc_auc',
                                       'precision','recall'))
# cross_validate cv, stratified kfold is used for classifier model
# prints, for each metric, the per-fold scores followed by their mean
print(scores['test_accuracy'],scores['test_accuracy'].mean(),
      scores['test_roc_auc'],scores['test_roc_auc'].mean(),
      scores['test_precision'],scores['test_precision'].mean(),
      scores['test_recall'],scores['test_recall'].mean()
     )

In [None]:
# XGB: downsampling and upweight(class weight), already stratified
# The majority class is randomly undersampled, then the positive class is
# down-weighted by 1/3.75, which is equivalent to up-weighting the negatives.
cv_metrics = ('accuracy','roc_auc','precision','recall')
random = RandomUnderSampler(sampling_strategy='majority', random_state=9)
model = XGBClassifier(
    objective='binary:logistic',
    max_depth=6,
    n_estimators=100,
    learning_rate=0.1,
    scale_pos_weight=1/3.75,
)
pipeline = Pipeline(steps=[('under_sampling',random), ('model',model)])
# cross_validate cv, stratified kfold is used for classifier model
scores = cross_validate(pipeline, X, y, cv=5, scoring=cv_metrics)
# per-fold scores followed by their mean, one metric after another
fold_report = []
for metric in cv_metrics:
    per_fold = scores['test_' + metric]
    fold_report.extend((per_fold, per_fold.mean()))
print(*fold_report)

In [None]:
# random forest results (values in the order the cells print them: accuracy, roc_auc, precision, recall):
# baseline: 0.79, 0.69, 0.60, 0.03
# class weight: 0.64, 0.69, 0.32, 0.64 ('balanced' and the majority/minority ratio give the same result)
# oversampling: 0.63, 0.69, 0.32, 0.65
# smote: 0.71, 0.67, 0.35, 0.4
# undersampling: similar to oversampling
# undersampling and upweight: similar to baseline

In [None]:
# randomforest, stratified cv (baseline: no class weights, no resampling)
# random_state pins the forest's bootstrap and feature sampling so the scores
# are reproducible and comparable with the other random forest variants.
model = RandomForestClassifier(n_estimators=100,max_depth=6,
                               random_state=0)
scores = cross_validate(model,X,y,cv=5,scoring=('accuracy','roc_auc',
                                                'precision','recall'))
# prints, for each metric, the per-fold scores followed by their mean
print(scores['test_accuracy'],scores['test_accuracy'].mean(),
      scores['test_roc_auc'],scores['test_roc_auc'].mean(),
      scores['test_precision'],scores['test_precision'].mean(),
      scores['test_recall'],scores['test_recall'].mean()
     )

In [None]:
# randomforest, class weight, stratified cv
# Positives are weighted 3.75x relative to negatives.
# random_state pins the forest's bootstrap and feature sampling so the scores
# are reproducible and comparable with the other random forest variants.
model = RandomForestClassifier(n_estimators=100,max_depth=6,
                               class_weight={0:1,1:3.75},
                               random_state=0)
scores = cross_validate(model,X,y,cv=5,scoring=('accuracy','roc_auc',
                                                'precision','recall'))
# prints, for each metric, the per-fold scores followed by their mean
print(scores['test_accuracy'],scores['test_accuracy'].mean(),
      scores['test_roc_auc'],scores['test_roc_auc'].mean(),
      scores['test_precision'],scores['test_precision'].mean(),
      scores['test_recall'],scores['test_recall'].mean()
     )

In [None]:
# randomforest, class weight balanced, stratified cv
# random_state pins the forest's bootstrap and feature sampling so the scores
# are reproducible and comparable with the other random forest variants.
model = RandomForestClassifier(n_estimators=100,max_depth=6,
                               class_weight='balanced',
                               random_state=0)
scores = cross_validate(model,X,y,cv=5,scoring=('accuracy','roc_auc',
                                                'precision','recall'))
# prints, for each metric, the per-fold scores followed by their mean
print(scores['test_accuracy'],scores['test_accuracy'].mean(),
      scores['test_roc_auc'],scores['test_roc_auc'].mean(),
      scores['test_precision'],scores['test_precision'].mean(),
      scores['test_recall'],scores['test_recall'].mean()
     )

In [None]:
# randomforest, oversample but no class weight, stratified cv
# random_state on the forest (the sampler is already seeded) makes the scores
# reproducible and comparable with the other random forest variants.
random_over = RandomOverSampler(random_state=0)
model = RandomForestClassifier(n_estimators=100,max_depth=6,
                               random_state=0)
pipeline = Pipeline(
                   [('over',random_over),
                    ('model',model)]
                   )
scores = cross_validate(pipeline,X,y,cv=5,scoring=('accuracy','roc_auc',
                                                'precision','recall'))
# prints, for each metric, the per-fold scores followed by their mean
print(scores['test_accuracy'],scores['test_accuracy'].mean(),
      scores['test_roc_auc'],scores['test_roc_auc'].mean(),
      scores['test_precision'],scores['test_precision'].mean(),
      scores['test_recall'],scores['test_recall'].mean()
     )

In [None]:
# randomforest, smote but no class weight, stratified cv
# random_state on the forest (the sampler is already seeded) makes the scores
# reproducible and comparable with the other random forest variants.
smote = SMOTE(random_state=0)
model = RandomForestClassifier(n_estimators=100,max_depth=6,
                               random_state=0)
pipeline = Pipeline(
                   [('over',smote),
                    ('model',model)]
                   )
scores = cross_validate(pipeline,X,y,cv=5,scoring=('accuracy','roc_auc',
                                                'precision','recall'))
# prints, for each metric, the per-fold scores followed by their mean
print(scores['test_accuracy'],scores['test_accuracy'].mean(),
      scores['test_roc_auc'],scores['test_roc_auc'].mean(),
      scores['test_precision'],scores['test_precision'].mean(),
      scores['test_recall'],scores['test_recall'].mean()
     )

In [None]:
# randomforest, undersampling but no class weight, stratified cv
# random_state on the forest (the sampler is already seeded) makes the scores
# reproducible and comparable with the other random forest variants.
under = RandomUnderSampler(sampling_strategy='majority', random_state=0)
model = RandomForestClassifier(n_estimators=100,max_depth=6,
                               random_state=0)
pipeline = Pipeline(
                   [('under',under),
                    ('model',model)]
                   )
scores = cross_validate(pipeline,X,y,cv=5,scoring=('accuracy','roc_auc',
                                                'precision','recall'))
# prints, for each metric, the per-fold scores followed by their mean
print(scores['test_accuracy'],scores['test_accuracy'].mean(),
      scores['test_roc_auc'],scores['test_roc_auc'].mean(),
      scores['test_precision'],scores['test_precision'].mean(),
      scores['test_recall'],scores['test_recall'].mean()
     )

In [None]:
# randomforest, undersampling and upweight, stratified cv
# The majority class is undersampled and then weighted 3.75x in the forest.
# random_state on the forest (the sampler is already seeded) makes the scores
# reproducible and comparable with the other random forest variants.
under = RandomUnderSampler(sampling_strategy='majority', random_state=0)
model = RandomForestClassifier(n_estimators=100,max_depth=6,
                               class_weight={0:3.75,1:1},
                               random_state=0)
pipeline = Pipeline(
                   [('under',under),
                    ('model',model)]
                   )
scores = cross_validate(pipeline,X,y,cv=5,scoring=('accuracy','roc_auc',
                                                'precision','recall'))
# prints, for each metric, the per-fold scores followed by their mean
print(scores['test_accuracy'],scores['test_accuracy'].mean(),
      scores['test_roc_auc'],scores['test_roc_auc'].mean(),
      scores['test_precision'],scores['test_precision'].mean(),
      scores['test_recall'],scores['test_recall'].mean()
     )

In [None]:
# logit results (values in the order the cells print them: accuracy, roc_auc, precision, recall):
# baseline: 0.78, 0.67, 0, 0
# class weight: 0.61, 0.65, 0.30, 0.60
# random oversample but no weight: 0.61, 0.67, 0.31, 0.65
# smote but no weight: 0.61, 0.68, 0.31, 0.67
# undersample and upweight: 0.78, 0.68, 0.5, 0.02

In [None]:
# logit: stratify, no classweight
# Features are standardized inside the pipeline before the logistic regression.
cv_metrics = ('accuracy','roc_auc','precision','recall')
scaler = StandardScaler()
model = LogisticRegression()
pipeline = Pipeline(steps=[('scaler',scaler), ('model',model)])
scores = cross_validate(pipeline, X, y, cv=5, scoring=cv_metrics)
# per-fold scores followed by their mean, one metric after another
fold_report = []
for metric in cv_metrics:
    per_fold = scores['test_' + metric]
    fold_report.extend((per_fold, per_fold.mean()))
print(*fold_report)

In [None]:
# logit: stratify, classweight
# Positives are weighted 3.75x relative to negatives in the loss.
cv_metrics = ('accuracy','roc_auc','precision','recall')
scaler = StandardScaler()
model = LogisticRegression(class_weight={0:1,1:3.75})
pipeline = Pipeline(steps=[('scaler',scaler), ('model',model)])
scores = cross_validate(pipeline, X, y, cv=5, scoring=cv_metrics)
# per-fold scores followed by their mean, one metric after another
fold_report = []
for metric in cv_metrics:
    per_fold = scores['test_' + metric]
    fold_report.extend((per_fold, per_fold.mean()))
print(*fold_report)

In [None]:
# logit: stratify, classweight balanced
cv_metrics = ('accuracy','roc_auc','precision','recall')
scaler = StandardScaler()
model = LogisticRegression(class_weight='balanced')
pipeline = Pipeline(steps=[('scaler',scaler), ('model',model)])
scores = cross_validate(pipeline, X, y, cv=5, scoring=cv_metrics)
# per-fold scores followed by their mean, one metric after another
fold_report = []
for metric in cv_metrics:
    per_fold = scores['test_' + metric]
    fold_report.extend((per_fold, per_fold.mean()))
print(*fold_report)

In [None]:
# logit, scaling, oversampling and no classweight
# Pipeline order: standardize, randomly oversample, then fit the logit.
cv_metrics = ('accuracy','roc_auc','precision','recall')
scaler = StandardScaler()
random = RandomOverSampler(random_state=9)
model = LogisticRegression()
pipeline = Pipeline(steps=[
    ('scaling',scaler),
    ('oversampling',random),
    ('model',model),
])
scores = cross_validate(pipeline, X, y, cv=5, scoring=cv_metrics)
# per-fold scores followed by their mean, one metric after another
fold_report = []
for metric in cv_metrics:
    per_fold = scores['test_' + metric]
    fold_report.extend((per_fold, per_fold.mean()))
print(*fold_report)

In [None]:
# logit, scaling, smote and no classweight
# Pipeline order: standardize, SMOTE, then fit the logit.
cv_metrics = ('accuracy','roc_auc','precision','recall')
scaler = StandardScaler()
smote = SMOTE(random_state=9)
model = LogisticRegression()
pipeline = Pipeline(steps=[
    ('scaling',scaler),
    ('smote',smote),
    ('model',model),
])
scores = cross_validate(pipeline, X, y, cv=5, scoring=cv_metrics)
# per-fold scores followed by their mean, one metric after another
fold_report = []
for metric in cv_metrics:
    per_fold = scores['test_' + metric]
    fold_report.extend((per_fold, per_fold.mean()))
print(*fold_report)

In [None]:
# logit, scaling, undersampling and upweight classweight
# The majority class is undersampled and then weighted 3.75x in the logit loss.
cv_metrics = ('accuracy','roc_auc','precision','recall')
scaler = StandardScaler()
random = RandomUnderSampler(sampling_strategy='majority', random_state=9)
model = LogisticRegression(class_weight={0:3.75,1:1})
pipeline = Pipeline(steps=[
    ('scaling',scaler),
    ('undersampling',random),
    ('model',model),
])
scores = cross_validate(pipeline, X, y, cv=5, scoring=cv_metrics)
# per-fold scores followed by their mean, one metric after another
fold_report = []
for metric in cv_metrics:
    per_fold = scores['test_' + metric]
    fold_report.extend((per_fold, per_fold.mean()))
print(*fold_report)

In [None]:
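# KNN: needs feature scaling, but has no class-weight option
# results (values in the order the cells print them: accuracy, roc_auc, precision, recall):
# baseline: 0.75, 0.60, 0.34, 0.17
# oversampling, smote, undersampling: 0.59, 0.59, 0.27, 0.53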

In [None]:
# knn: scaler, stratified (default KNeighborsClassifier on standardized features)
cv_metrics = ('accuracy','roc_auc','precision','recall')
scaler = StandardScaler()
model = KNeighborsClassifier()
pipeline = Pipeline(steps=[('scaling',scaler), ('model',model)])
scores = cross_validate(pipeline, X, y, cv=5, scoring=cv_metrics)
# per-fold scores followed by their mean, one metric after another
fold_report = []
for metric in cv_metrics:
    per_fold = scores['test_' + metric]
    fold_report.extend((per_fold, per_fold.mean()))
print(*fold_report)

In [None]:
# knn random oversampling
# Pipeline order: standardize, randomly oversample, then fit KNN.
cv_metrics = ('accuracy','roc_auc','precision','recall')
scaler = StandardScaler()
model = KNeighborsClassifier()
random_over = RandomOverSampler(random_state=0)
pipeline = Pipeline(steps=[
    ('scaling',scaler),
    ('over',random_over),
    ('model',model),
])
scores = cross_validate(pipeline, X, y, cv=5, scoring=cv_metrics)
# per-fold scores followed by their mean, one metric after another
fold_report = []
for metric in cv_metrics:
    per_fold = scores['test_' + metric]
    fold_report.extend((per_fold, per_fold.mean()))
print(*fold_report)

In [None]:
# knn smote
# Pipeline order: standardize, SMOTE, then fit KNN.
cv_metrics = ('accuracy','roc_auc','precision','recall')
scaler = StandardScaler()
model = KNeighborsClassifier()
smote = SMOTE(random_state=0)
pipeline = Pipeline(steps=[
    ('scaling',scaler),
    ('over',smote),
    ('model',model),
])
scores = cross_validate(pipeline, X, y, cv=5, scoring=cv_metrics)
# per-fold scores followed by their mean, one metric after another
fold_report = []
for metric in cv_metrics:
    per_fold = scores['test_' + metric]
    fold_report.extend((per_fold, per_fold.mean()))
print(*fold_report)

In [None]:
# knn undersampling
# Pipeline order: standardize, undersample the majority class, then fit KNN.
cv_metrics = ('accuracy','roc_auc','precision','recall')
scaler = StandardScaler()
model = KNeighborsClassifier()
random_under = RandomUnderSampler(sampling_strategy='majority', random_state=0)
pipeline = Pipeline(steps=[
    ('scaling',scaler),
    ('under',random_under),
    ('model',model),
])
scores = cross_validate(pipeline, X, y, cv=5, scoring=cv_metrics)
# per-fold scores followed by their mean, one metric after another
fold_report = []
for metric in cv_metrics:
    per_fold = scores['test_' + metric]
    fold_report.extend((per_fold, per_fold.mean()))
print(*fold_report)

In [None]:
# knn , tune n
# For n_neighbors = 1..19, cross-validate the scale -> undersample -> KNN
# pipeline and record the mean of each test metric. Position j in every list
# therefore belongs to n_neighbors = j + 1.
accuracy, roc_auc, precision, recall = [], [], [], []
mean_collectors = {
    'test_accuracy': accuracy,
    'test_roc_auc': roc_auc,
    'test_precision': precision,
    'test_recall': recall,
}
for i in range(1, 20):
    scaler = StandardScaler()
    model = KNeighborsClassifier(n_neighbors=i)
    random_under = RandomUnderSampler(sampling_strategy='majority', random_state=0)
    pipeline = Pipeline(steps=[
        ('scaling',scaler),
        ('under',random_under),
        ('model',model),
    ])
    scores = cross_validate(pipeline, X, y, cv=5,
                            scoring=('accuracy','roc_auc','precision','recall'))
    for score_key, collected in mean_collectors.items():
        collected.append(scores[score_key].mean())
print(accuracy,roc_auc,precision,recall)

In [None]:
# Mean CV recall per tuned n_neighbors. The tuning loop starts at
# n_neighbors=1, so plot against 1..len(recall); the default 0-based positions
# would label every point one neighbor too low.
plt.plot(range(1, len(recall) + 1), recall)
plt.xlabel('n_neighbors')
plt.ylabel('mean CV recall')
plt.show()

In [None]:
# Mean CV precision per tuned n_neighbors. The tuning loop starts at
# n_neighbors=1, so plot against 1..len(precision); the default 0-based
# positions would label every point one neighbor too low.
plt.plot(range(1, len(precision) + 1), precision)
plt.xlabel('n_neighbors')
plt.ylabel('mean CV precision')
plt.show()

In [None]:
# permutation importance
# PermutationImportance is used here on an already fitted estimator and
# held-out data. Build both explicitly so this section does not depend on
# whatever `model` was left in the kernel; after the KNN tuning loop that is an
# unfitted KNeighborsClassifier, because cross_validate only fits clones.
# NOTE(review): the class-weighted XGB configuration from the single-fit cell is
# assumed to be the model of interest - confirm.
train_X, val_X, train_y, val_y = train_test_split(X, y,
                                                  stratify=y,
                                                  random_state=0)
model = XGBClassifier(objective='binary:logistic',max_depth=10,
                      n_estimators=100,learning_rate=0.1,
                      scale_pos_weight=class_ratio)
model.fit(train_X, train_y)
perm = PermutationImportance(model, random_state=1).fit(val_X, val_y)
eli5.show_weights(perm, feature_names = val_X.columns.tolist())

In [None]:
# partial dependence plot
# One isolated partial-dependence plot per lagged feature, computed with the
# current `model` on the validation features.
features = [
    'l_n_comment', 'l_n_question', 'l_n_answer', 'l_mean_accepted_answer',
    'l_mean_answer_count', 'l_mean_comment_count', 'l_mean_favorite_count',
    'l_mean_view_count', 'l_mean_score',
]
for f in features:
    pdp_goals = pdp.pdp_isolate(
        model,
        dataset=val_X,
        model_features=features,
        feature=f,
    )
    pdp.pdp_plot(pdp_goals, f)
    plt.show()