In [109]:
import numpy as np
import pandas as pd

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.datasets import fetch_lfw_people
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

## 1: Face Recognition, but not evil this time

Using the faces dataset in:

```
from sklearn.datasets import fetch_lfw_people
faces = fetch_lfw_people(min_faces_per_person=60)
```

If you use the `faces.target` and `faces.target_names` attributes, you can build a facial recognition algorithm.

Use sklearn **gridsearch** (or an equivalent, like random search) to optimize the model for accuracy. Try both a SVM-based classifier and a logistic regression based classifier (with a feature pipeline of your choice) to get the best model. You should have at least 80% accuracy.

In [110]:
faces = fetch_lfw_people(min_faces_per_person=60)

pca = PCA(n_components = 200, random_state=2, whiten=True)
svc = SVC(kernel='rbf', class_weight="balanced")

In [111]:
#make pipeline 
svc_pipeline = make_pipeline(pca,svc)

In [112]:
#training & Test
Xtrain, Xtest, ytrain, ytest = train_test_split(faces.data, faces.target, random_state=42)

In [113]:
svc_param_grid = {'svc__C': [1, 5, 10, 50],
                  'svc__gamma': [0.0001, 0.0005, 0.001, 0.005]}b

SyntaxError: invalid syntax (<ipython-input-113-19a9391d0898>, line 2)

In [24]:
svc_grid = GridSearchCV(svc_pipeline, svc_param_grid)

In [33]:
svc_grid.fit(Xtrain, ytrain)
svc_ypred = svc_grid.predict(Xtest)

In [37]:
print(svc_grid.best_params_)
print(svc_grid.best_score_)
print(accuracy_score(ytest, svc_ypred))
print(classification_report(ytest, svc_ypred))

{'svc__C': 10, 'svc__gamma': 0.001}
0.8199775642588889
0.8605341246290801
              precision    recall  f1-score   support

           0       0.81      0.87      0.84        15
           1       0.78      0.91      0.84        68
           2       0.77      0.77      0.77        31
           3       0.89      0.86      0.87       126
           4       0.83      0.83      0.83        23
           5       1.00      0.75      0.86        20
           6       0.86      1.00      0.92        12
           7       0.97      0.88      0.93        42

    accuracy                           0.86       337
   macro avg       0.87      0.86      0.86       337
weighted avg       0.87      0.86      0.86       337



In [41]:
# Logistic Regression 
lr = LogisticRegression()
lr_pipeline = make_pipeline(pca, lr)

In [42]:
lr_param_grid = {"logisticregression__C": [0.25, 0.5, 1, 5, 10],
                "logisticregression__penalty": ["l2","none"]}

lr_grid = GridSearchCV(lr_pipeline, lr_param_grid)               

In [43]:

lr_grid.fit(Xtrain, ytrain)
lr_grid.best_params_

{'logisticregression__C': 0.5, 'logisticregression__penalty': 'l2'}

In [45]:
lr_ypred = lr_grid.predict(Xtest)
print(classification_report(ytest,lr_ypred))

              precision    recall  f1-score   support

           0       0.79      0.73      0.76        15
           1       0.82      0.87      0.84        68
           2       0.72      0.58      0.64        31
           3       0.84      0.81      0.82       126
           4       0.59      0.70      0.64        23
           5       0.81      0.65      0.72        20
           6       0.85      0.92      0.88        12
           7       0.79      0.90      0.84        42

    accuracy                           0.80       337
   macro avg       0.78      0.77      0.77       337
weighted avg       0.80      0.80      0.79       337



# 2: Bag of Words, Bag of Popcorn

By this point, you are ready for the [Bag of Words, Bag of Popcorn](https://www.kaggle.com/c/word2vec-nlp-tutorial/data) competition. 

Use NLP feature pre-processing (using, SKLearn, Gensim, Spacy or Hugginface) to build the best classifier you can. Use a  feature pipeline, and gridsearch for your final model.

A succesful project should get 90% or more on a **holdout** dataset you kept for yourself.

In [49]:
df = pd.read_csv("data/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

In [50]:
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [53]:
import string
#cleaning review text
df['review'] = df['review'].str.replace('<[^<]+?>', '', regex=True)
df['review'] = df['review'].str.replace('[^a-zA-Z]', ' ', regex=True).str.lower()
df['review'] = df['review'].str.replace('  ', ' ', regex=True)


0         with all this stuff going down at the moment ...
1         the classic war of the worlds by timothy hine...
2         the film starts with a manager nicholas bell ...
3         it must be assumed that those who praised thi...
4         superbly trashy and wondrously unpretentious ...
                               ...                        
24995     it seems like more consideration has gone int...
24996     i don t believe they made this film completel...
24997     guy is a loser can t get girls needs to build...
24998     this minute documentary bu uel made in the ea...
24999     i saw this movie as a child and it broke my h...
Name: review, Length: 25000, dtype: object

In [108]:
#Forest Classifier

import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 5000)
forest = RandomForestClassifier()

In [57]:
pipe = Pipeline([("vec", vectorizer), ("forest", forest)])

In [59]:
X_train, X_test, y_train, y_test = train_test_split(df['review'], df["sentiment"], test_size=0.2, random_state=42)

In [60]:
grid_params = {"forest__n_estimators": [64, 100, 128]} 
grid = GridSearchCV(pipe, grid_params, cv=5)

In [62]:
grid.fit(X_train, y_train)


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vec',
                                        CountVectorizer(max_features=5000)),
                                       ('forest', RandomForestClassifier())]),
             param_grid={'forest__n_estimators': [64, 100, 128]})

In [63]:
y_pred = grid.predict(X_test)

In [66]:
print(grid.best_params_)
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

{'forest__n_estimators': 128}
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      2481
           1       0.84      0.86      0.85      2519

    accuracy                           0.85      5000
   macro avg       0.85      0.85      0.85      5000
weighted avg       0.85      0.85      0.85      5000

0.8492
