#### Load Data


In [1]:
import pickle
import numpy as np
import pandas as pd

#### Tfidf & Bag-of-Words Using spaCy

In [2]:
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stopwords 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics import accuracy_score 
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

import string

In [3]:
# Load the pre-split feature matrices, labels, and column names from data/.
#
# allow_pickle=True is passed explicitly: NumPy >= 1.16.3 refuses to load
# object-dtype arrays unless pickling is allowed, and the feature arrays
# presumably are object dtype since they carry the 'Text_clean' text column
# (TODO confirm). The flag is a no-op for plain numeric arrays.
# NOTE(security): unpickling can execute arbitrary code -- only load .npy
# files that this project generated itself.
df_cols = np.load('data/df_cols.npy', allow_pickle=True)

# Re-wrap the raw arrays as DataFrames so columns are addressable by name.
X_test = pd.DataFrame(np.load('data/X_test.npy', allow_pickle=True), columns=df_cols)
X_train = pd.DataFrame(np.load('data/X_train.npy', allow_pickle=True), columns=df_cols)
X_val = pd.DataFrame(np.load('data/X_val.npy', allow_pickle=True), columns=df_cols)

# Target labels for each split.
y_test = np.load('data/y_test.npy', allow_pickle=True)
y_train = np.load('data/y_train.npy', allow_pickle=True)
y_val = np.load('data/y_val.npy', allow_pickle=True)

# Sanity check: (rows, columns) of each split.
X_train.shape, X_test.shape, X_val.shape

((53480, 50), (18183, 50), (17471, 50))

##### XGBoost Classifier - Bag-of-Words (BoW)

In [4]:
from xgboost import XGBClassifier

# Bag-of-words features: unigram counts produced by CountVectorizer's
# built-in tokenizer (no custom tokenizer is passed); terms that occur in
# fewer than 5 documents are dropped.
vectorizer = CountVectorizer(ngram_range=(1, 1), min_df=5)
classifier = XGBClassifier(max_depth=40, learning_rate=1)

# Chain vectorization and classification into a single estimator.
pipe = Pipeline(steps=[
    ('vectorizer', vectorizer),
    ('classifier', classifier),
])

# Pair every cleaned text with its label, one list per split.
train = list(zip(X_train['Text_clean'], y_train))
val = list(zip(X_val['Text_clean'], y_val))
test = list(zip(X_test['Text_clean'], y_test))

# Fit on the training split, then predict on the validation and training texts.
pipe.fit([text for text, _ in train], [label for _, label in train])
pred_data_val = pipe.predict([text for text, _ in val])
pred_data_train = pipe.predict([text for text, _ in train])

# Report accuracy on both splits so the train/validation gap is visible.
print("Validation accuracy score: ", accuracy_score(y_val, pred_data_val))
print("Training accuracy score: ", accuracy_score(y_train, pred_data_train))

  if diff:


Validation accuracy score:  0.6210291339934749
Training accuracy score:  0.9961667913238594


  if diff:


In [5]:
# Save proba:
xgb_bow_train_proba = pipe.predict_proba([x[0] for x in train])
xgb_bow_val_proba = pipe.predict_proba([x[0] for x in val])
xgb_bow_test_proba = pipe.predict_proba([x[0] for x in test])

In [6]:
# Persist each split's predict_proba output to its own pickle file
# in the current working directory.
bow_proba_outputs = (
    ('xgb_bow_train_proba.pkl', xgb_bow_train_proba),
    ('xgb_bow_val_proba.pkl', xgb_bow_val_proba),
    ('xgb_bow_test_proba.pkl', xgb_bow_test_proba),
)
for out_path, proba in bow_proba_outputs:
    with open(out_path, 'wb') as f:
        pickle.dump(proba, f)

##### XGBoost Classifier - TF-IDF

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features: unigram weights produced by TfidfVectorizer's built-in
# tokenizer (no custom tokenizer is passed); terms that occur in fewer than
# 5 documents are dropped.
vectorizer = TfidfVectorizer(ngram_range=(1, 1), min_df=5)
classifier = XGBClassifier(max_depth=40, learning_rate=1)

# Chain vectorization and classification into a single estimator.
# This rebinds `pipe`, replacing whatever pipeline it referred to before.
pipe = Pipeline(steps=[
    ('vectorizer', vectorizer),
    ('classifier', classifier),
])

# Pair every cleaned text with its label, one list per split.
train = list(zip(X_train['Text_clean'], y_train))
val = list(zip(X_val['Text_clean'], y_val))
test = list(zip(X_test['Text_clean'], y_test))

# Fit on the training split, then predict on the validation and training texts.
pipe.fit([text for text, _ in train], [label for _, label in train])
pred_data_val = pipe.predict([text for text, _ in val])
pred_data_train = pipe.predict([text for text, _ in train])

# Report accuracy on both splits so the train/validation gap is visible.
print("Validation accuracy score: ", accuracy_score(y_val, pred_data_val))
print("Training accuracy score: ", accuracy_score(y_train, pred_data_train))

  if diff:


Validation accuracy score:  0.6232041669051571
Training accuracy score:  0.9961854899027673


  if diff:


In [8]:
# Save proba:
xgb_tfidf_train_proba = pipe.predict_proba([x[0] for x in train])
xgb_tfidf_val_proba = pipe.predict_proba([x[0] for x in val])
xgb_tfidf_test_proba = pipe.predict_proba([x[0] for x in test])

In [9]:
# Persist each split's predict_proba output to its own pickle file
# in the current working directory.
tfidf_proba_outputs = (
    ('xgb_tfidf_train_proba.pkl', xgb_tfidf_train_proba),
    ('xgb_tfidf_val_proba.pkl', xgb_tfidf_val_proba),
    ('xgb_tfidf_test_proba.pkl', xgb_tfidf_test_proba),
)
for out_path, proba in tfidf_proba_outputs:
    with open(out_path, 'wb') as f:
        pickle.dump(proba, f)