In [None]:
#!pip install --quiet tqdm

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from tqdm import tqdm

In [None]:
pwd

In [None]:
cd ../../../capstone-52/Pickled_from_mongo/

In [None]:
# Load the pickled sample into a DataFrame.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# that come from a trusted source.
df = pd.read_pickle('../Pickled_from_mongo/combined_eg_gulf_200k_sample.p')

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Preview the first rows to check which columns were loaded.
df.head()

In [None]:
# Bag-of-words term counts over the whole `cleaned_text` column.
# `count_vect` is kept because later cells reuse its fitted vocabulary.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(df['cleaned_text'])
# (n_documents, n_vocabulary_terms)
X_train_counts.shape

In [None]:
# Check the container type returned by `fit_transform`.
type(X_train_counts)

In [None]:
# Re-weight the raw term counts with tf-idf.
# `tfidf_transformer` is kept because later cells reuse it on new sentences.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Same shape as the count matrix: only the weights change.
X_train_tfidf.shape

### Ridge Classifier Pipeline

In [None]:
# Hold out a test split for the Ridge experiments.
# The numeric label column is only added to `df` further down the notebook
# (KNN section), so on a fresh kernel it does not exist yet at this point.
# Use it if present; otherwise encode the string `class` column here without
# mutating `df`. A Series is required because later cells index the labels
# with `.iloc`.
if 'class_numerical' in df.columns:
    class_labels = df['class_numerical']
else:
    class_labels = pd.Series(
        LabelEncoder().fit_transform(df['class']),
        index=df.index,
        name='class_numerical',
    )

X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'], class_labels, random_state = 42)

In [None]:
# Inspect one training document (position 6 in the training split).
X_train.iloc[6]

In [None]:
# Manual grid search for the LSA (tf-idf -> truncated SVD) + Ridge pipeline.
# Every hyperparameter combination is scored with 2-fold stratified CV on the
# training split only; the test split is never touched here.
params = [(ngr, mindf, maxdf, alpha)
          for ngr in [(1,2)]
          for mindf in [1,2,4,8]
          for maxdf in np.linspace(.95,.999,2)
          for alpha in np.logspace(-1,3,2)
         ]

skfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

results_list = []

for ngr, mindf, maxdf, alpha in tqdm(params):
    results = {
        'ngram_range' : ngr,
        'min_df' : mindf,
        'max_df' : maxdf,
        'alpha' : alpha
    }
    train_scores = []
    val_scores = []

    # `split` only needs the number of samples and the labels, so the text is
    # passed as-is (no per-iteration string copy of the whole column).
    for train_indices, val_indices in skfold.split(X_train, y_train):

        X_train_kf, y_train_kf = X_train.iloc[train_indices], y_train.iloc[train_indices]
        X_val_kf, y_val_kf = X_train.iloc[val_indices], y_train.iloc[val_indices]

        lsa_pipe = Pipeline([
                                ('tfidf', TfidfVectorizer(ngram_range=ngr, min_df=mindf, max_df=maxdf)),
                                # Seeded so that repeated runs give the same scores.
                                ('svd', TruncatedSVD(50, random_state=42)),
                                ('clf', RidgeClassifier(alpha=alpha))
                            ])

        lsa_pipe.fit(X_train_kf, y_train_kf)

        train_scores.append(lsa_pipe.score(X_train_kf, y_train_kf))
        val_scores.append(lsa_pipe.score(X_val_kf, y_val_kf))

    results['mean_train_score'] = np.mean(train_scores)
    results['mean_val_score'] = np.mean(val_scores)
    results_list.append(results)

# Best configuration first.
cv_results = pd.DataFrame(results_list).sort_values('mean_val_score', ascending=False)

# Pick the winning hyperparameters BY COLUMN NAME. Positional unpacking of the
# row depends on the DataFrame's column order, which is insertion order in
# current pandas (not alphabetical), and would silently mix the values up.
best_row = cv_results.iloc[0]
ngr = tuple(best_row['ngram_range'])
mindf = int(best_row['min_df'])
maxdf = float(best_row['max_df'])
alpha = float(best_row['alpha'])

cv_results.head()

In [None]:
# Refit the LSA + Ridge pipeline on the full training split using the
# hyperparameters (`ngr`, `mindf`, `maxdf`, `alpha`) chosen by the search above.
# NOTE(review): the search scored pipelines with 50 SVD components, while this
# final model uses 100 — confirm that the mismatch is intentional.
lsa_pipe = Pipeline([
                        ('tfidf', TfidfVectorizer(ngram_range=ngr, min_df=mindf, max_df=maxdf)),
                        ('svd', TruncatedSVD(100)),
                        ('clf', RidgeClassifier(alpha=alpha))
                    ])

# `Pipeline.fit` returns the fitted pipeline itself, so `history` is just
# another reference to `lsa_pipe` (it is not a training-history object).
history = lsa_pipe.fit(X_train, y_train)

In [None]:
# Score of the fitted pipeline on the training split.
lsa_pipe.score(X_train, y_train)

In [None]:
# Score of the fitted pipeline on the held-out test split.
lsa_pipe.score(X_test, y_test)

In [None]:
# Predicted labels for the test split.
# NOTE(review): `y_pred` is not read again in the visible cells; the same
# predictions are recomputed below as `predictions`.
y_pred = lsa_pipe.predict(X_test)

In [None]:
# Per-class precision / recall / F1: training split first, then test split.
for split_X, split_y in ((X_train, y_train), (X_test, y_test)):
    split_report = classification_report(split_y, lsa_pipe.predict(split_X))
    print(split_report)

In [None]:
# Test-split predictions used for the confusion matrix in the next cell.
predictions = lsa_pipe.predict(X_test)

In [None]:
# Confusion matrix on the test split: rows are true labels, columns are
# predicted labels.
confmat = confusion_matrix(y_test, predictions)

print(confmat)

In [None]:
# Shape of the continuous scores used for the ROC curve below.
# RidgeClassifier does not provide `predict_proba`; `decision_function` is its
# continuous output (one signed score per sample for a two-class problem).
lsa_pipe.decision_function(X_train).shape

In [None]:
# ROC curve of the Ridge pipeline on the training split.
# `predict` returns hard 1-D labels, so it cannot be column-indexed and gives
# no ranking information; the ROC needs the continuous `decision_function`
# scores instead.
ridge_train_decision = lsa_pipe.decision_function(X_train)
fpr, tpr, thresholds = roc_curve(y_train, ridge_train_decision)

plt.figure(figsize=(8,6))
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Ridge ROC on training split (AUC = %.3f)' % roc_auc_score(y_train, ridge_train_decision));

### MultinomialNB

In [None]:
# Fit Naive Bayes on the tf-idf matrix built from the full `cleaned_text`
# column, with the string `class` column as target (no hold-out split here).
clf = MultinomialNB().fit(X_train_tfidf, df['class'])

#### Classify two sentences written in different dialects and check whether the predicted class is accurate

In [None]:
# Two sentences to classify; they must go through the SAME fitted vectorizer
# and tf-idf transformer as the training data (transform only, no re-fitting).
docs_new = ['الثوره المصريه تحولت من ثورة شارع محدش يزعل', 'ذويه ارفضوا لانه عيار جمبازي مافيه شي وبليس مايكسر اماعينه يامال لضعفه قطو بو سبعة ارواح']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

# Show each sentence next to the class the model predicted for it.
for doc, category in zip(docs_new, predicted):
    print('%r %s' % (doc, category))
    

In [None]:
# Same three steps as the manual cells above (counts -> tf-idf -> Naive Bayes),
# bundled into a single pipeline so raw text can be passed in directly.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),])

In [None]:
# Train on the full corpus; evaluation below uses separately tagged sentences.
text_clf.fit(df['cleaned_text'], df['class']) 

#### Evaluating predictive accuracy of the model.

Tagged tweets from both the EG and GULF classes that the model has not seen during training.

In [None]:
search_sentences = [
{"sentence": "الثوره المصريه تحولت من ثورة شارع محدش يزعل", "class" : "EG"},
{"sentence": "نفسي اكون زيك بعرف اطنشك أو اخليك اخر حاجة و بعد كده اضحك عليك بكلمتين و انت تصدق كل مرة عادي", "class" : "EG"},
{"sentence": "بما أن أغلب اللي متابعني مش بقدر اوصلهم أغلب الوقت. . ف كل يوم هعمل تويته آخر اليوم اللي هيعمل لايك", "class" : "EG"},
{"sentence": "مقاومتنا للأشياء طلعت بتقل مع الزمن، مبقيناش نناهد ف حاجة.. و مش عشان أحنا جامدين قوي. هو حيلنا بس", "class" : "EG"},
{"sentence": "عارف ايه احلى حاجة حاصلة ليا انى منك وانت برضه بتجرى فيا انت اخر كل يوم باخدك ف حضنى وانت اول", "class" : "EG"},
{"sentence": "القاضى اللى حكم على المعتقلين بالاعدام هو هو نفس القاضى اللى هيراقب الانتخابات", "class" : "EG"},
{"sentence": "وفجأة تيجي سيرة حاجة في وسط الكلام تقلب عليك القديم والجديد وترسم في دماغك علامات استفهام مالهاش", "class" : "EG"},
{"sentence": "السنة اللي فاتت الاعلام الانجليزي قال المفروض بيب يعرف انه في البريمييرليج لازم يتأقلم و يلعب كورتنا", "class" : "EG"},
{"sentence": "حرب و قتال و ناس تموت و هذا الدلخ يقول سعيد و مثل أجواء كرة القدم ", "class" : "GULF"},
{"sentence": "من غباء الهلالي الدلخ اللي يفتخر بفوز فريقه من قيادة رئيس الحكام كلاتنبيرغ له سنه ماسنع الحكام السعوديين", "class" : "GULF"},
{"sentence": "شفتوا هوشة شيعان وغالي لو هي بين الهلاليين كان شفتوا هاشتاق كبر راسهم المنسم وكان جاك هذا الدلخ ", "class" : "GULF"},
{"sentence": "ذويه ارفضوا لانه عيار جمبازي مافيه شي وبليس مايكسر اماعينه يامال لضعفه قطو بو سبعة ارواح ", "class" : "GULF"},
{"sentence": "صج ياجماعه في سوال محيرني ليش المتان مافيهم النفسيه عكس الضعاف تقول خاشوقه ومنفس", "class" : "GULF"},
{"sentence": "لم نعاند التاريخ مسيو خاشوقه بل الواقع والعقلانية ابعدنا من التدمير والانفلات", "class" : "GULF"},
{"sentence": "أي والله وعندي عنه ابو خاشوقة أسرار لا تشرف قد أقولها اذا لم يلجم لسانه عن سب وطني", "class" : "GULF"},
{"sentence": "قبل ماتتكلمين يالطيبه افهمي السالفه ومنب ملزومه بسنابي اني اشرح كل شيء صارت بالتفصيل بس لانك قلق خل", "class" : "GULF"},
]

In [None]:
# One row per tagged sentence, with columns `sentence` and `class`.
search_sentences_df = pd.DataFrame(search_sentences)

In [None]:
# Random peek at four rows (unseeded, so the rows differ between runs).
search_sentences_df.sample(4)

In [None]:
# Accuracy of the Naive Bayes pipeline on the tagged sentences:
# the fraction of predictions that match the tagged class.
docs_test = search_sentences_df['sentence']
predicted = text_clf.predict(docs_test)
(predicted == search_sentences_df['class']).mean()

__MultinomialNB prediction score__

### Support Vector Machine (SVM)

In [None]:
# Linear SVM: an SGD-trained classifier with hinge loss and L2 penalty on top
# of the same count -> tf-idf features. This rebinds `text_clf`, replacing the
# Naive Bayes pipeline.
svm_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)),
]
text_clf = Pipeline(svm_steps).fit(df['cleaned_text'], df['class'])

# Fraction of tagged sentences whose predicted class matches the tag.
predicted = text_clf.predict(docs_test)
(predicted == search_sentences_df['class']).mean()

__Precision/Recall and Confusion Matrix for SVM__

In [None]:
# Per-class precision / recall / F1 of the SVM on the tagged sentences.
# `classification_report` is imported directly at the top of the notebook;
# there is no `metrics` module alias in scope.
print(classification_report(search_sentences_df['class'], predicted,
                            target_names=['EG',"GULF"]))

In [None]:
# Confusion matrix of the SVM on the tagged sentences (rows: true class,
# columns: predicted class). Uses the directly imported `confusion_matrix`.
confusion_matrix(search_sentences_df['class'], predicted)

In [None]:
# Mean accuracy of the fitted SVM pipeline on the tagged sentences.
text_clf.score(docs_test, search_sentences_df['class'])

__Parameter tuning using grid search__

In [None]:
# Search space for GridSearchCV. Keys follow the `<step name>__<parameter>`
# convention, so each entry targets one step of the `text_clf` pipeline.
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
             }

In [None]:
# Exhaustive search over `parameters`; n_jobs=-1 runs the fits in parallel on
# all available cores.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [None]:
# Run the search on the full corpus.
gs_clf = gs_clf.fit(df['cleaned_text'], df['class'])

__Using the tuned model as a classifier to predict dialect__

In [None]:
# Predicted dialect for a single sentence. `predict` already returns the class
# label, so take the first (only) element directly rather than using the
# label to index the tagged-sentence Series.
gs_clf.predict(['الثوره المصريه تحولت من ثورة شارع محدش يزعل'])[0]

In [None]:
# Mean cross-validated score of the best parameter combination.
gs_clf.best_score_  

In [None]:
# List the winning value for every searched parameter, in alphabetical order.
for param_name in sorted(parameters):
    best_value = gs_clf.best_params_[param_name]
    print(f"{param_name}: {best_value!r}")

In [None]:
# Full grid-search results as a table, one row per parameter combination.
cv_results_df = pd.DataFrame(gs_clf.cv_results_)

In [None]:
# Keep the combinations with a mean test score of at least 0.70.
# Despite its name this holds the filtered rows, not a boolean mask.
high_mean_test_mask = cv_results_df[cv_results_df['mean_test_score'] >= 0.70]

In [None]:
# Display the filtered grid-search rows.
high_mean_test_mask

## KNN

In [None]:
# Encode the string class labels as integers and store them in a new
# `class_numerical` column (this mutates `df`).
le = LabelEncoder()
df['class_numerical'] = le.fit_transform(df['class'])

In [None]:
# Drop the columns the KNN section does not use.
# errors='ignore' makes the cell safe to re-run: columns that are already gone
# are skipped instead of raising KeyError.
# NOTE: after this cell `df` no longer has the string `class` column, so the
# earlier cells that read df['class'] cannot be re-run without reloading `df`.
df = df.drop(columns=['cleaned_geo','cleaned_name', 'class'], errors='ignore')

In [None]:

# Train/test split for the KNN experiments: same call and same seed as the
# split in the Ridge section, now that `class_numerical` exists on `df`.
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'], df['class_numerical'], random_state = 42)

In [None]:
# Manual CV loop for the LSA (tf-idf -> truncated SVD) + KNN pipeline.
# Only the n-gram range is searched, and the grid has a single candidate.
params = [ngr for ngr in [(1,2)]]

skfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

results_list = []

for ngr in tqdm(params):
    results = {
        'ngram_range' : ngr,
    }
    train_scores = []
    val_scores = []

    # `split` only needs the number of samples and the labels, so the text is
    # passed as-is (no per-iteration string copy of the whole column).
    for train_indices, val_indices in skfold.split(X_train, y_train):

        X_train_kf, y_train_kf = X_train.iloc[train_indices], y_train.iloc[train_indices]
        X_val_kf, y_val_kf = X_train.iloc[val_indices], y_train.iloc[val_indices]

        # NOTE: this rebinds `lsa_pipe`, so after this cell the name refers to
        # a KNN pipeline fitted on the last fold, not the Ridge model.
        lsa_pipe = Pipeline([
                                ('tfidf', TfidfVectorizer(ngram_range=ngr)),
                                ('svd', TruncatedSVD(50)),
                                ('clf', KNeighborsClassifier())
                            ])

        lsa_pipe.fit(X_train_kf, y_train_kf)

        train_scores.append(lsa_pipe.score(X_train_kf, y_train_kf))
        val_scores.append(lsa_pipe.score(X_val_kf, y_val_kf))

    results['mean_train_score'] = np.mean(train_scores)
    results['mean_val_score'] = np.mean(val_scores)
    results_list.append(results)

# Best configuration first.
cv_results = pd.DataFrame(results_list).sort_values('mean_val_score', ascending=False)

# Take only the n-gram range of the best row, selected by column name.
# Assigning the whole row would also put the two score columns into `ngr`.
ngr = tuple(cv_results.iloc[0]['ngram_range'])

cv_results.head()

In [None]:
# Final KNN pipeline fitted on the full training split.
# NOTE(review): the vectorizer uses default settings here, so the `ngr` value
# selected by the CV cell is not applied, and the SVD keeps 100 components
# versus 50 during CV — confirm that both differences are intended.
knn_pipe = Pipeline([
                        ('tfidf', TfidfVectorizer()),
                        ('svd', TruncatedSVD(100)),
                        ('clf', KNeighborsClassifier())
                    ])

# `Pipeline.fit` returns the fitted pipeline itself, so `history` is another
# reference to `knn_pipe`.
history = knn_pipe.fit(X_train, y_train)

In [None]:
# Accuracy of the fitted KNN pipeline on the held-out test split.
# Features and labels must come from the same split; pairing X_train with
# y_test mixes arrays of different lengths.
history.score(X_test, y_test)

In [None]:
# Class-membership probabilities from the fitted KNN pipeline for the first
# few test documents (one column per class).
knn_pipe.predict_proba(X_test)[:5]

In [None]:
# Evaluate a k=7 nearest-neighbour classifier on the held-out test split.
# The raw text has to be vectorised first, so k-NN runs inside the same
# tf-idf -> SVD pipeline used elsewhere in this section.
knn7_pipe = Pipeline([
                        ('tfidf', TfidfVectorizer()),
                        ('svd', TruncatedSVD(100)),
                        ('clf', KNeighborsClassifier(n_neighbors=7))
                    ])
knn7_pipe.fit(X_train, y_train)

predictions = np.asarray(knn7_pipe.predict(X_test))

# Accuracy = fraction of test documents whose prediction matches the label.
accuracy = np.mean(predictions == np.asarray(y_test))
# Parenthesised so the percentage is computed BEFORE formatting; without the
# parentheses `%` binds first and the formatted string is repeated 100 times.
print('\nThe accuracy of our classifier is %d%%' % (accuracy * 100))

In [None]:
# Decision-boundary plot for KNN in a 2-D LSA projection of the training text.
# A decision boundary can only be drawn in two dimensions, so the documents
# are projected onto the first two SVD components and a separate KNN model is
# fitted on those coordinates (it is NOT the 100-component `knn_pipe`).
MESH_STEP = 0.02    # grid resolution of the background mesh
N_SCATTER = 2000    # maximum number of training points drawn on top

projection_pipe = Pipeline([
                               ('tfidf', TfidfVectorizer()),
                               ('svd', TruncatedSVD(2, random_state=42))
                           ])
X_train_2d = projection_pipe.fit_transform(X_train)
y_train_arr = np.asarray(y_train)

knn_2d = KNeighborsClassifier().fit(X_train_2d, y_train_arr)

# Assign a class to every point of the mesh [x_min, x_max] x [y_min, y_max].
x_min, x_max = X_train_2d[:,0].min() - .5, X_train_2d[:,0].max() + .5
y_min, y_max = X_train_2d[:,1].min() - .5, X_train_2d[:,1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, MESH_STEP), np.arange(y_min, y_max, MESH_STEP))
Z = knn_2d.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot.
Z = Z.reshape(xx.shape)
fig, ax = plt.subplots(figsize=(8, 6))
ax.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# Overlay a seeded random subset of the training points so the figure stays
# readable and cheap to render.
scatter_rng = np.random.RandomState(42)
scatter_idx = scatter_rng.choice(len(X_train_2d),
                                 size=min(N_SCATTER, len(X_train_2d)),
                                 replace=False)
ax.scatter(X_train_2d[scatter_idx, 0], X_train_2d[scatter_idx, 1],
           c=y_train_arr[scatter_idx], cmap=plt.cm.Paired, edgecolors='k', s=15)
ax.set_xlabel('LSA component 1')
ax.set_ylabel('LSA component 2')
ax.set_title('KNN decision regions in 2-D LSA space')

ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xticks(())
ax.set_yticks(())

plt.show()

In [None]:
# Predicted labels for the test split. `predict` takes features only; the
# labels are not an argument.
# NOTE: `lsa_pipe` is whichever pipeline was most recently bound to that name
# (the KNN cross-validation cell rebinds it inside its fold loop).
lsa_pipe.predict(X_test)