In [None]:
#!pip install --quiet tqdm

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

In [37]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from tqdm import tqdm

In [5]:
pwd

'/home/jovyan/capstone-52/topic_modeling_experiments/classifiers'

In [6]:
cd ../../../capstone-52/Pickled_from_mongo/

/home/jovyan/capstone-52/Pickled_from_mongo


In [7]:
df = pd.read_pickle('../Pickled_from_mongo/combined_eg_gulf_200k_sample.p')

In [8]:
df.shape

(192936, 5)

In [9]:
df.head()

Unnamed: 0,_id,cleaned_geo,cleaned_name,cleaned_text,class
0,5a2c7a44204c9e0400cdc0e2,,kamal,أزاي أقول لك كنا زمان والماضي كان فى الغيب بكر...,EG
1,5a2c7a44204c9e0400cdc0e3,Egypt,agabdelrehim,هي آراء آه بس أزاي أجويرو منتهي يعني أمال لو م...,EG
2,5a2c7a44204c9e0400cdc0e4,,ElsndubadE,أنت صيني أزاي تقارن شادي بالخطيب ألي هو الوحيد...,EG
3,5a2c7a44204c9e0400cdc0e5,,h_sawires,أزاي الناس كانت بتغرد في الخمسينات قبل إختراع ...,EG
4,5a2c7a44204c9e0400cdc0e6,Egypt,Hagerelmor,التوينز اللي معاها كل الحلو والوحش والمصايب وا...,EG


### Label Encode the Categories

In [11]:
le = LabelEncoder()
df['class_numerical'] = le.fit_transform(df['class'])

## Ridge Classifier

In [17]:
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'], df['class_numerical'], random_state = 42)

In [18]:
X_train.iloc[6]

'يا جمهور سلطان الدررع والمراكز الاولى نبيها صوتو لسلطان رقم رصيدك'

In [19]:
params = [(ngr, mindf, maxdf, alpha)
          for ngr in [(1,2)]
          for mindf in [1,2,4,8]
          for maxdf in np.linspace(.95,.999,2)
          for alpha in np.logspace(-1,3,2)
         ]

skfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

results_list = []

for ngr, mindf, maxdf, alpha in tqdm(params):
    results = {
        'ngram_range' : ngr,
        'min_df' : mindf,
        'max_df' : maxdf,
        'alpha' : alpha
    }
    train_scores = list()
    val_scores = list()
    
    for train_indices, val_indices in skfold.split(X_train.astype('str'), y_train):
        
        X_train_kf, y_train_kf = X_train.iloc[train_indices], y_train.iloc[train_indices]
        X_val_kf, y_val_kf = X_train.iloc[val_indices], y_train.iloc[val_indices]
        
        ridge_clf_pipe = Pipeline([
                                ('tfidf', TfidfVectorizer(ngram_range=ngr, min_df=mindf, max_df=maxdf)),
                                ('svd', TruncatedSVD(50)),
                                ('clf', RidgeClassifier(alpha=alpha))
                            ])
        
        ridge_clf_pipe.fit(X_train_kf, y_train_kf)
        
        train_scores.append(ridge_clf_pipe.score(X_train_kf, y_train_kf))
        val_scores.append(ridge_clf_pipe.score(X_val_kf, y_val_kf))
        
    results['mean_train_score'] = np.mean(train_scores)
    results['mean_val_score'] = np.mean(val_scores)
    results_list.append(results)

cv_results = pd.DataFrame(results_list)

cv_results.sort_values('mean_val_score', ascending=False, inplace=True)

alpha, maxdf, _, _, mindf, ngr = cv_results.iloc[0].values

cv_results.head()

100%|██████████| 16/16 [08:07<00:00, 30.45s/it]


Unnamed: 0,alpha,max_df,mean_train_score,mean_val_score,min_df,ngram_range
4,0.1,0.95,0.764606,0.765967,2,"(1, 2)"
10,0.1,0.999,0.762657,0.762374,4,"(1, 2)"
6,0.1,0.999,0.762235,0.761904,2,"(1, 2)"
2,0.1,0.999,0.759146,0.761275,1,"(1, 2)"
0,0.1,0.95,0.757225,0.759561,1,"(1, 2)"


In [None]:
ridge_clf_pipe = Pipeline([
                        ('tfidf', TfidfVectorizer(ngram_range=ngr, min_df=mindf, max_df=maxdf)),
                        ('svd', TruncatedSVD(100)),
                        ('clf', RidgeClassifier(alpha=alpha))
                    ])

ridge_clf_pipe.fit(X_train, y_train)

In [20]:
ridge_clf_pipe.score(X_train, y_train)

0.75124048043565395

In [21]:
ridge_clf_pipe.score(X_test, y_test)

0.75106771157274954

In [22]:
print(classification_report(y_train, ridge_clf_pipe.predict(X_train)))
print(classification_report(y_test, ridge_clf_pipe.predict(X_test)))

             precision    recall  f1-score   support

          0       0.82      0.65      0.72     72924
          1       0.70      0.86      0.77     71778

avg / total       0.76      0.75      0.75    144702

             precision    recall  f1-score   support

          0       0.82      0.65      0.72     24328
          1       0.70      0.86      0.77     23906

avg / total       0.76      0.75      0.75     48234



In [53]:
predictions = ridge_clf_pipe.predict(X_test)

In [52]:
proba_predictions = ridge_clf_pipe.predict_proba(X_test)

AttributeError: 'RidgeClassifier' object has no attribute 'predict_proba'

In [24]:
confmat = confusion_matrix(y_test, predictions)

print(confmat)

[[15746  8582]
 [ 3425 20481]]


In [None]:
lr.predict_proba(X_train)[:, 1].shape

In [26]:
fpr, tpr, thresholds = roc_curve(y_train, ridge_clf_pipe.predict(X_train)[:,1])

plt.figure(figsize=(8,6))
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

IndexError: too many indices for array

## Support Vector Machine (SGD)

In [54]:
sgd_clf_pipe = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='modified_huber', penalty='l2',
                                           alpha=1e-3, random_state=42)),
                    ])

sgd_clf_pipe.fit(X_train, y_train)  

Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...     penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
       warm_start=False))])

In [55]:
x_test_pred_proba = sgd_clf_pipe.predict_proba(X_test)

In [63]:
x_test_pred_proba[:5][:,0]

array([ 0.74889722,  0.7496778 ,  0.53591916,  0.29060866,  0.39606878])

In [71]:
X_test_preds_thresh_45 = x_test_pred_proba[:,0] < .45

In [70]:
X_test_preds_thresh_60[:5].astype(int)

array([0, 0, 0, 1, 1])

In [57]:
y_test[:5]

45404    0
84725    0
28308    0
50458    1
2451     1
Name: class_numerical, dtype: int64

In [115]:
print(classification_report(y_test, x_test_pred_proba[:,0]<.44))

             precision    recall  f1-score   support

          0       0.84      0.83      0.83     24328
          1       0.83      0.84      0.83     23906

avg / total       0.83      0.83      0.83     48234



In [127]:
wrong_mask = y_test != (x_test_pred_proba[:,0]<.45).astype(int)

In [None]:
# lower scores mask
#confidence_mask = (x_test_pred_proba[:,0]<.55)|(x_test_pred_proba[:,0]>.45)

In [128]:
confidence_mask = (x_test_pred_proba[:,0]<.2)|(x_test_pred_proba[:,0]>.8)

In [129]:
print(classification_report(y_test[confidence_mask], x_test_pred_proba[confidence_mask,0]<.5))

             precision    recall  f1-score   support

          0       0.99      0.98      0.98      7261
          1       0.95      0.96      0.95      2425

avg / total       0.98      0.98      0.98      9686



array([0, 1, 0, ..., 1, 0, 1])

In [None]:
def predict(model, X, threshold):
    proba = model.predict_proba(X)
    return proba[:,0] < threshold

In [39]:
sgd_clf_pipe.score(X_train, y_train)

0.73814460062749654

In [42]:
sgd_clf_pipe.score(X_test, y_test)

0.743521167641083

In [43]:
print(classification_report(y_train, sgd_clf_pipe.predict(X_train)))
print(classification_report(y_test, sgd_clf_pipe.predict(X_test)))

             precision    recall  f1-score   support

          0       0.95      0.51      0.66     72924
          1       0.66      0.97      0.79     71778

avg / total       0.81      0.74      0.72    144702

             precision    recall  f1-score   support

          0       0.93      0.53      0.68     24328
          1       0.67      0.96      0.79     23906

avg / total       0.80      0.74      0.73     48234



## MultinomialNB

In [45]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(df['cleaned_text'])
X_train_counts.shape

(192936, 223045)

In [46]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(192936, 223045)

In [47]:
multinomialnb_clf = MultinomialNB().fit(X_train_tfidf, df['class'])

In [48]:
multinomialnb_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),])

In [49]:
multinomialnb_clf.fit(df['cleaned_text'], df['class'])

Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [50]:
print(classification_report(y_train, multinomialnb_clf.predict(X_train)))
print(classification_report(y_test, multinomialnb_clf.predict(X_test)))

ValueError: Mix of label input types (string and number)