In [7]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import re
import sys
from scipy import sparse
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from scipy.sparse import coo_matrix, hstack
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
import pickle

In [3]:
# Load the prepared dataset and the engineered features. Both CSVs use their
# first column as the index, which is what the later join aligns on.
df = pd.read_csv(
    'prepped_data.csv',
    index_col=0,
)
e_features = pd.read_csv(
    'extracted_features.csv',
    index_col=0,
)

In [4]:
# Assemble the design matrix: the four raw text columns come first, followed by
# every engineered feature column (joined on the shared index).
text_columns = ['postText', 'targetTitle', 'targetDescription', 'targetKeywords']
X = df[text_columns].join(e_features)
y = df['label']

# CountVectorizer rejects NaN documents, so blank out missing text in *every*
# text column. Previously 'targetTitle' was the only one left unfilled.
X[text_columns] = X[text_columns].fillna('')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)


In [5]:
# Downsample the negative (label == 0) class to the size of the positive class.
def downsampling(X_train, y_train):
    """Balance a binary training set by randomly dropping label-0 rows.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature rows, indexed like ``y_train``.
    y_train : pandas.Series
        Binary labels; rows equal to 0 are the ones eligible for removal.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``X_train`` and ``y_train`` with the same randomly chosen label-0 rows
        removed, so that the number of remaining label-0 rows equals the
        label sum. If label-0 rows are already no more numerous than that,
        nothing is dropped.
    """
    # int() keeps `.sample(n)` happy even if the labels are stored as floats.
    n_cb = int(y_train.sum())
    n_ncb = len(y_train) - n_cb
    # Clamp at zero: a negative count would make `.sample` raise when the
    # label-0 class is not the majority.
    n_ncb_to_drop = max(n_ncb - n_cb, 0)
    dropids = y_train[y_train == 0].sample(n_ncb_to_drop, random_state=100).index
    return X_train.drop(dropids), y_train.drop(dropids)

#X_train,y_train = downsampling(X_train,y_train)
    

# Bag of words vs. extracted features

In [6]:
# Vectorize the training text fields.
# Each field gets its own TweetTokenizer and its own case-preserving word
# n-gram CountVectorizer: 1-5 grams for the post text and target title,
# 1-4 grams for the target description and target keywords.
def _make_bow_vectorizer(tokenizer, max_ngram):
    """Return a case-preserving word n-gram CountVectorizer for n in 1..max_ngram."""
    return CountVectorizer(
        analyzer='word',
        ngram_range=(1, max_ngram),
        tokenizer=tokenizer.tokenize,
        lowercase=False,
    )


tw_tokenizer = nltk.tokenize.TweetTokenizer()
tt_tokenizer = nltk.tokenize.TweetTokenizer()
td_tokenizer = nltk.tokenize.TweetTokenizer()
tk_tokenizer = nltk.tokenize.TweetTokenizer()

tw_vectorizer = _make_bow_vectorizer(tw_tokenizer, 5)
tt_vectorizer = _make_bow_vectorizer(tt_tokenizer, 5)
td_vectorizer = _make_bow_vectorizer(td_tokenizer, 4)
tk_vectorizer = _make_bow_vectorizer(tk_tokenizer, 4)

# Learn each vocabulary from the training split only.
X_train_twvec_bow = tw_vectorizer.fit_transform(X_train['postText'])
X_train_ttvec_bow = tt_vectorizer.fit_transform(X_train['targetTitle'])
X_train_tdvec_bow = td_vectorizer.fit_transform(X_train['targetDescription'])
X_train_tkvec_bow = tk_vectorizer.fit_transform(X_train['targetKeywords'])

In [6]:
# Vectorize the held-out text fields with the already-fitted vectorizers
# (transform only -- nothing is re-fitted on the test split).
X_test_twvec_bow, X_test_tdvec_bow, X_test_ttvec_bow, X_test_tkvec_bow = (
    vectorizer.transform(X_test[column])
    for vectorizer, column in (
        (tw_vectorizer, 'postText'),
        (td_vectorizer, 'targetDescription'),
        (tt_vectorizer, 'targetTitle'),
        (tk_vectorizer, 'targetKeywords'),
    )
)

In [7]:
# Candidate classifiers, evaluated on every feature set below.
# `models` and `names` are parallel lists that are consumed together via zip();
# `results` collects the scores of each experiment under its own key.
# The stochastic estimators are seeded (same seed as the train/test split) so
# that the reported scores can be reproduced on a fresh kernel.
lr = LogisticRegression(random_state=100)
xgb = XGBClassifier(n_estimators=3000, gamma=2, max_depth=1, n_jobs=8)
rf = RandomForestClassifier(random_state=100)
models = [lr, xgb, rf]
names = ['Logistic Regression', 'XGB', 'Random Forest']
results = {}

In [10]:
# Experiment: bag-of-words features only.
# The stacked n-gram matrices are the same for every classifier, so they are
# built once (in CSR form, which supports the row slicing done by the CV
# splitter) instead of being rebuilt on every loop iteration.
sub_X_train = sparse.hstack(
    [X_train_tdvec_bow, X_train_ttvec_bow, X_train_twvec_bow, X_train_tkvec_bow],
    format='csr',
)
sub_X_test = sparse.hstack(
    [X_test_tdvec_bow, X_test_ttvec_bow, X_test_twvec_bow, X_test_tkvec_bow],
    format='csr',
)
cv_metrics = ('precision', 'recall', 'f1', 'roc_auc')
y_test_values = y_test.values

results['bag_of_words'] = {}
for clf, name in zip(models, names):
    # 3-fold cross-validation on the training split; store the mean of each metric.
    print('in CV', name)
    scores = cross_validate(clf, sub_X_train, y_train, cv=3, scoring=cv_metrics)
    results['bag_of_words'][name+'_cv_scores'] = {
        metric: scores['test_' + metric].mean() for metric in cv_metrics
    }

    # Refit on the whole training split and score on the held-out test split.
    print('training')
    clf.fit(sub_X_train, y_train)
    pred_prob = clf.predict_proba(sub_X_test)[:, 1]
    pred = pred_prob > 0.5  # boolean predictions at the 0.5 probability threshold
    test_scores = {
        'precision': precision_score(y_test, pred),
        'recall': recall_score(y_test, pred),
        'f1': f1_score(y_test, pred),
        'roc_auc': roc_auc_score(y_test, pred_prob),
    }

    # Index labels of the misclassified test rows, kept for error analysis:
    # fn = predicted 0 but labelled 1, fp = predicted 1 but labelled 0.
    test_scores['fn'] = X_test.index[~pred & (y_test_values == 1)]
    test_scores['fp'] = X_test.index[pred & (y_test_values == 0)]
    results['bag_of_words'][name+'_test_scores'] = test_scores

    print('\n===============================================')

in CV Logistic Regression
training

in CV XGB


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


training

in CV Random Forest
training



In [11]:
# Experiment: extracted features only.
# The first four columns of X are the raw text fields, so everything from
# column 4 onward is an extracted feature. Slice once instead of on every use.
feat_X_train = X_train.iloc[:, 4:]
feat_X_test = X_test.iloc[:, 4:]
cv_metrics = ('precision', 'recall', 'f1', 'roc_auc')
y_test_values = y_test.values

results['extracted_features'] = {}
for clf, name in zip(models, names):
    # 3-fold cross-validation on the training split; store the mean of each metric.
    print('in CV', name)
    scores = cross_validate(clf, feat_X_train, y_train, cv=3, scoring=cv_metrics)
    results['extracted_features'][name+'_cv_scores'] = {
        metric: scores['test_' + metric].mean() for metric in cv_metrics
    }

    # Refit on the whole training split and score on the held-out test split.
    print('training')
    clf.fit(feat_X_train, y_train)
    pred_prob = clf.predict_proba(feat_X_test)[:, 1]
    pred = pred_prob > 0.5  # boolean predictions at the 0.5 probability threshold
    test_scores = {
        'precision': precision_score(y_test, pred),
        'recall': recall_score(y_test, pred),
        'f1': f1_score(y_test, pred),
        'roc_auc': roc_auc_score(y_test, pred_prob),
    }

    # Index labels of the misclassified test rows, kept for error analysis:
    # fn = predicted 0 but labelled 1, fp = predicted 1 but labelled 0.
    test_scores['fn'] = X_test.index[~pred & (y_test_values == 1)]
    test_scores['fp'] = X_test.index[pred & (y_test_values == 0)]
    results['extracted_features'][name+'_test_scores'] = test_scores

    print('\n===============================================')

in CV Logistic Regression
training

in CV XGB


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


training

in CV Random Forest
training



In [12]:
# Experiment: combined features (bag-of-words n-grams + extracted features).
# The first four columns of X are the raw text fields, so columns 4 onward are
# the extracted features appended to the n-gram matrices. The stacked matrices
# are the same for every classifier, so they are built once (in CSR form,
# which supports the row slicing done by the CV splitter).
sub_X_train = sparse.hstack(
    [X_train_tdvec_bow, X_train_ttvec_bow, X_train_twvec_bow, X_train_tkvec_bow,
     X_train.iloc[:, 4:].values],
    format='csr',
)
sub_X_test = sparse.hstack(
    [X_test_tdvec_bow, X_test_ttvec_bow, X_test_twvec_bow, X_test_tkvec_bow,
     X_test.iloc[:, 4:].values],
    format='csr',
)
cv_metrics = ('precision', 'recall', 'f1', 'roc_auc')
y_test_values = y_test.values

results['combined'] = {}
for clf, name in zip(models, names):
    # 3-fold cross-validation on the training split; store the mean of each metric.
    print('in CV', name)
    scores = cross_validate(clf, sub_X_train, y_train, cv=3, scoring=cv_metrics)
    results['combined'][name+'_cv_scores'] = {
        metric: scores['test_' + metric].mean() for metric in cv_metrics
    }

    # Refit on the whole training split and score on the held-out test split.
    print('training')
    clf.fit(sub_X_train, y_train)
    pred_prob = clf.predict_proba(sub_X_test)[:, 1]
    pred = pred_prob > 0.5  # boolean predictions at the 0.5 probability threshold
    test_scores = {
        'precision': precision_score(y_test, pred),
        'recall': recall_score(y_test, pred),
        'f1': f1_score(y_test, pred),
        'roc_auc': roc_auc_score(y_test, pred_prob),
    }

    # Index labels of the misclassified test rows, kept for error analysis:
    # fn = predicted 0 but labelled 1, fp = predicted 1 but labelled 0.
    test_scores['fn'] = X_test.index[~pred & (y_test_values == 1)]
    test_scores['fp'] = X_test.index[pred & (y_test_values == 0)]
    results['combined'][name+'_test_scores'] = test_scores

    print('\n===============================================')

in CV Logistic Regression
training

in CV XGB


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


training

in CV Random Forest
training



In [13]:
# Display the nested results dict: feature set -> '<model>_cv_scores' /
# '<model>_test_scores' -> metric values. Each test-score entry also holds the
# 'fn' / 'fp' index objects, so this output is long.
results

{'bag_of_words': {'Logistic Regression_cv_scores': {'precision': 0.7406293520564691,
   'recall': 0.44140625,
   'f1': 0.5531165570739276,
   'roc_auc': 0.8359451865988126},
  'Logistic Regression_test_scores': {'precision': 0.7339901477832512,
   'recall': 0.48534201954397393,
   'f1': 0.5843137254901961,
   'roc_auc': 0.8569147085797413,
   'fn': Int64Index([806030450594578432, 831913533084405760, 817732714107437059,
               856192826241110017, 830778487640240128, 821026728667533313,
               817324408796835842, 841037379326496768, 820621811780034561,
               852845152007802881,
               ...
               837457471425953792, 845772302839091201, 841145043863908353,
               842145373036597248, 834904173728661504, 841112184264523777,
               827893794070724608, 842057380162613252, 814713884057681920,
               838933321740660736],
              dtype='int64', name='id', length=474),
   'fp': Int64Index([846919968126066690, 805378641198809088

In [14]:
# Persist the results dict (metrics plus the fn/fp index objects) to the file
# 'results' in the working directory. `pickle` is already imported in the
# notebook's import cell, so it is not re-imported here.
with open('results', 'wb') as f:
    pickle.dump(results, f)