#### Load Data


In [9]:

import pickle
import numpy as np
import pandas as pd

from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stopwords 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score 
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

import string

In [4]:
df_cols = np.load('data/df_cols.npy')

X_test = pd.DataFrame(np.load('data/X_test.npy'), columns=df_cols)
X_train = pd.DataFrame(np.load('data/X_train.npy'), columns=df_cols)
X_val = pd.DataFrame(np.load('data/X_val.npy'), columns=df_cols)

y_test = np.load('data/y_test.npy')
y_train = np.load('data/y_train.npy')
y_val = np.load('data/y_val.npy')


X_train.shape, X_test.shape, X_val.shape

((53480, 50), (18183, 50), (17471, 50))

##### Random Forest Classifier - Tfidf

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer 

#create vectorizer object to generate feature vectors, we will use custom spacy’s tokenizer
vectorizer = TfidfVectorizer(min_df= 5, ngram_range=(1,1))
classifier = RandomForestClassifier()

# Create the  pipeline to clean, tokenize, vectorize, and classify 
pipe = Pipeline([('vectorizer', vectorizer),
                 ('classifier', classifier)])


# Load sample data
train = list(zip(X_train['Text_clean'], y_train))
val = list(zip(X_val['Text_clean'], y_val))
test = list(zip(X_test['Text_clean'], y_test))

# Create model and measure accuracy
pipe.fit([x[0] for x in train], [x[1] for x in train]) 
pred_data_val = pipe.predict([x[0] for x in val]) 
pred_data_train = pipe.predict([x[0] for x in train]) 

print("Validation accuracy score: ", accuracy_score(y_val, pred_data_val))
print("Training accuracy score: ", accuracy_score(y_train, pred_data_train))

Validation accuracy score:  0.6212008471180814
Training accuracy score:  0.9902393418100225


In [19]:
# Save proba:
rfc_tfidf_train_proba = pipe.predict_proba([x[0] for x in train])
rfc_tfidf_val_proba = pipe.predict_proba([x[0] for x in val])
rfc_tfidf_test_proba = pipe.predict_proba([x[0] for x in test])

with open('data/rfc_tfidf_train_proba.pkl', 'wb') as f:
    pickle.dump(rfc_tfidf_train_proba, f)
    
with open('data/rfc_tfidf_val_proba.pkl', 'wb') as f:
    pickle.dump(rfc_tfidf_val_proba, f)
    
with open('data/rfc_tfidf_test_proba.pkl', 'wb') as f:
    pickle.dump(rfc_tfidf_test_proba, f)

##### Random Forest Classifier - BOW

In [21]:
#create vectorizer object to generate feature vectors, we will use custom spacy’s tokenizer
vectorizer = CountVectorizer(min_df= 5, ngram_range=(1,1))
classifier = RandomForestClassifier()

# Create the  pipeline to clean, tokenize, vectorize, and classify 
pipe = Pipeline([('vectorizer', vectorizer),
                 ('classifier', classifier)])


# Load sample data
train = list(zip(X_train['Text_clean'], y_train))
val = list(zip(X_val['Text_clean'], y_val))
test = list(zip(X_test['Text_clean'], y_test))

# Create model and measure accuracy
pipe.fit([x[0] for x in train], [x[1] for x in train]) 
pred_data_val = pipe.predict([x[0] for x in val]) 
pred_data_train = pipe.predict([x[0] for x in train]) 

print("Validation accuracy score: ", accuracy_score(y_val, pred_data_val))
print("Training accuracy score: ", accuracy_score(y_train, pred_data_train))

Validation accuracy score:  0.6140461335928109
Training accuracy score:  0.9901832460732984


In [22]:
# Save proba:
rfc_bow_train_proba = pipe.predict_proba([x[0] for x in train])
rfc_bow_val_proba = pipe.predict_proba([x[0] for x in val])
rfc_bow_test_proba = pipe.predict_proba([x[0] for x in test])

with open('data/rfc_bow_train_proba.pkl', 'wb') as f:
    pickle.dump(rfc_bow_train_proba, f)
    
with open('data/rfc_bow_val_proba.pkl', 'wb') as f:
    pickle.dump(rfc_bow_val_proba, f)
    
with open('data/rfc_bow_test_proba.pkl', 'wb') as f:
    pickle.dump(rfc_bow_test_proba, f)

##### SVM Classifier

In [None]:
countvect =  TfidfVectorizer(stop_words='english') 
text_features =countvect.fit(X_train['Text']) # bag of words

text_feature_vec_train = text_features.transform(X_train['Text'])
text_feature_vec_test = text_features.transform(X_test['Text'])
text_feature_vec_val = text_features.transform(X_val['Text'])

clf = SVC(probability=True, gamma=0.33, C=10)

clf.fit(text_feature_vec_train, y_train)
rnd_val_pred = clf.predict(text_feature_vec_val)
rnd_train_pred = clf.predict(text_feature_vec_train)
print("Validation accuracy score: ", accuracy_score(rnd_val_pred, y_val))
print("Train accuracy score: ", accuracy_score(rnd_train_pred, y_train))

In [20]:
# Save proba:
svm_tfidf_train_proba = clf.predict_proba(X_train)
svm_tfidf_val_proba = clf.predict_proba(X_val)
svm_tfidf_test_proba = clf.predict_proba(X_test)

with open('data/svm_tfidf_train_proba.pkl', 'wb') as f:
    pickle.dump(svm_tfidf_train_proba, f)
    
with open('data/svm_tfidf_val_proba.pkl', 'wb') as f:
    pickle.dump(svm_tfidf_val_proba, f)
    
with open('data/svm_tfidf_test_proba.pkl', 'wb') as f:
    pickle.dump(svm_tfidf_test_proba, f)

AttributeError: predict_proba is not available when  probability=False

In [None]:
#create vectorizer object to generate feature vectors, we will use custom spacy’s tokenizer
vectorizer = CountVectorizer(min_df= 5, tokenizer = spacy_tokenizer, ngram_range=(1,1))
classifier = SVC(probability=True, gamma=0.33, C=10)

# Create the  pipeline to clean, tokenize, vectorize, and classify 
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])


# Load sample data
train = list(zip(X_train['Text_clean'], y_train))
val = list(zip(X_val['Text_clean'], y_val))

# Create model and measure accuracy
pipe.fit([x[0] for x in train], [x[1] for x in train]) 
pred_data_val = pipe.predict([x[0] for x in val]) 
pred_data_train = pipe.predict([x[0] for x in train]) 

print("Validation accuracy score: ", accuracy_score(y_val, pred_data_val))
print("Training accuracy score: ", accuracy_score(y_train, pred_data_train))

##### XGBOOST - Tfidf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer 
from xgboost import XGBClassifier

#create vectorizer object to generate feature vectors, we will use custom spacy’s tokenizer
vectorizer = TfidfVectorizer(min_df= 5, ngram_range=(1,1))
classifier = XGBClassifier(learning_rate=1, max_depth=40)

# Create the  pipeline to clean, tokenize, vectorize, and classify 
pipe = Pipeline([('vectorizer', vectorizer),
                 ('classifier', classifier)])


# Load sample data
train = list(zip(X_train['Text_clean'], y_train))
val = list(zip(X_val['Text_clean'], y_val))

# Create model and measure accuracy
pipe.fit([x[0] for x in train], [x[1] for x in train]) 
pred_data_val = pipe.predict([x[0] for x in val]) 
pred_data_train = pipe.predict([x[0] for x in train]) 

print("Validation accuracy score: ", accuracy_score(y_val, pred_data_val))
print("Training accuracy score: ", accuracy_score(y_train, pred_data_train))

##### XGBOOST - bow - TBD

In [None]:
#create vectorizer object to generate feature vectors, we will use custom spacy’s tokenizer
vectorizer = CountVectorizer(min_df= 5,  ngram_range=(1,1))
classifier = XGBClassifier(learning_rate=1, max_depth=40)

# Create the  pipeline to clean, tokenize, vectorize, and classify 
pipe = Pipeline([
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])


# Load sample data
train = list(zip(X_train['Text_clean'], y_train))
val = list(zip(X_val['Text_clean'], y_val))

# Create model and measure accuracy
pipe.fit([x[0] for x in train], [x[1] for x in train]) 
pred_data_val = pipe.predict([x[0] for x in val]) 
pred_data_train = pipe.predict([x[0] for x in train]) 

print("Validation accuracy score: ", accuracy_score(y_val, pred_data_val))
print("Training accuracy score: ", accuracy_score(y_train, pred_data_train))