<a href="https://colab.research.google.com/github/trinhvu1711/ML_Semester2_2023/blob/main/Lab_8_20130471_TrinhLongVu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This lab deals with **GridSearchCV** for tuning the hyper-parameters of an estimator and applying vectorization techniques to the **movie reviews dataset** for classification task. 

*   **Deadline: 23:59, 17/4/2023**



# Import libraries

In [39]:
# code
from sklearn import datasets
from sklearn import svm
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import precision_score 
from sklearn.metrics import recall_score 
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from prettytable import PrettyTable
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [None]:
iris = datasets.load_iris()
X = iris['data']
y = iris['target']
# print(iris)
# X, y without Feature Selection.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 10)

#Task 1. With **iris** dataset
*  1.1. Apply **GridSearchCV** for **SVM** to find the best hyperparameters using the following param_grid.

```
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}
```




In [None]:
#code
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}
svc = svm.SVC()
svc.fit(X_train, y_train)
predictions = svc.predict(X_test)
print(classification_report(y_test, predictions))

clf = GridSearchCV(SVC(), param_grid, refit=True, scoring = 'accuracy')
clf.fit(X_train, y_train)
print(sorted(clf.cv_results_.keys()))

# print best parameter after tuning
print(clf.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(clf.best_estimator_)
print(clf.best_score_)
y_pred = svc.predict(X_test)
svmac = round(accuracy_score(y_test, y_pred), 2)
svmpc = round(precision_score(y_test, y_pred, average='micro'), 2)
svmrc= round(recall_score(y_test, y_pred, average='micro'), 2)
svmf= round(f1_score(y_test, y_pred, average='micro'), 2)
print ("Accuracy : ", svmac) 
print ("Precision : ", svmpc) 
print ("Recall score : ", svmrc) 
print ("F1 score : ", svmf)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      1.00      1.00        17
           2       1.00      1.00      1.00        14

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

['mean_fit_time', 'mean_score_time', 'mean_test_score', 'param_C', 'param_gamma', 'param_kernel', 'params', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'std_fit_time', 'std_score_time', 'std_test_score']
{'C': 1, 'gamma': 1, 'kernel': 'linear'}
SVC(C=1, gamma=1, kernel='linear')
0.980952380952381
Accuracy :  1.0
Precision :  1.0
Recall score :  1.0
F1 score :  1.0


*  1.2. Apply **GridSearchCV** for **kNN** to find the best hyperparameters using the following param_grid.

```
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
```
where

    *  **n_neighbors**: Decide the best k based on the values we have computed earlier.
    *  **weights**: Check whether adding weights to the data points is beneficial to the model or not. 'uniform' assigns no weight, while 'distance' weighs points by the inverse of their distances meaning nearer points will have more weight than the farther points.
    *  **metric**: The distance metric to be used will calculating the similarity.


In [None]:
#code#code
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)
print(classification_report(y_test, predictions))

clf = GridSearchCV(KNeighborsClassifier(), grid_params,scoring = 'accuracy', verbose = 1, cv=3, n_jobs = -1, refit = True)
clf.fit(X_train, y_train)
print(sorted(clf.cv_results_.keys()))

# print best parameter after tuning
print(clf.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(clf.best_estimator_)
print(clf.best_score_)

y_pred_knn =  knn.predict(X_test)
knnac =round(accuracy_score(y_test, y_pred_knn), 4)
knnpc =round(precision_score(y_test, y_pred_knn, average = 'micro'), 4)
knnrc=round(recall_score(y_test, y_pred_knn, average = 'micro'), 4)
knnf=round(f1_score(y_test, y_pred_knn, average = 'micro'), 4)
print('kNN Classifier:')
print('Accuracy:', knnac)
print('Precision:', knnpc)
print('Recall:', knnrc)
print('F1-score:', knnf)
print()



              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      0.94      0.97        17
           2       0.93      1.00      0.97        14

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45

Fitting 3 folds for each of 36 candidates, totalling 108 fits
['mean_fit_time', 'mean_score_time', 'mean_test_score', 'param_metric', 'param_n_neighbors', 'param_weights', 'params', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'std_fit_time', 'std_score_time', 'std_test_score']
{'metric': 'minkowski', 'n_neighbors': 5, 'weights': 'distance'}
KNeighborsClassifier(weights='distance')
0.9619047619047619
kNN Classifier:
Accuracy: 0.9778
Precision: 0.9778
Recall: 0.9778
F1-score: 0.9778



*  1.3. Apply **GridSearchCV** for **Random Forest** to find the best hyperparameters using the following param_grid.

```
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}
```

In [None]:
#code
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}
rfcsf=RandomForestClassifier()
#Train the model using the training sets
rfcsf.fit(X_train,y_train)
y_pred = rfcsf.predict(X_test)
print(classification_report(y_test, y_pred))
CV_rfc = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train)
print(sorted(clf.cv_results_.keys()))

# print best parameter after tuning
print(clf.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(clf.best_estimator_)
print(clf.best_score_)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      0.94      0.97        17
           2       0.93      1.00      0.97        14

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45

['mean_fit_time', 'mean_score_time', 'mean_test_score', 'param_metric', 'param_n_neighbors', 'param_weights', 'params', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'std_fit_time', 'std_score_time', 'std_test_score']
{'metric': 'minkowski', 'n_neighbors': 5, 'weights': 'uniform'}
KNeighborsClassifier()
0.98


In [None]:
rfac = round(accuracy_score(y_test, y_pred), 4)
rfpc = round(precision_score(y_test, y_pred, average='micro'), 4)
rfrc= round(recall_score(y_test, y_pred, average='micro'), 4)
rff= round(f1_score(y_test, y_pred, average='micro'), 4)
print ("Accuracy : ", rfac) 
print ("Precision : ", rfpc) 
print ("Recall score : ", rfrc) 
print ("F1 score : ", rff)

Accuracy :  0.9778
Precision :  0.9778
Recall score :  0.9778
F1 score :  0.9778


*   1.4 Compare the best obtained results from 1.1 to 1.3 (use PrettyTable to dispaly the results)

In [None]:
table1 = PrettyTable()
table1.field_names = ["Classification algorithm", "Accuracy", "Precision", "Recall", "F1-score"]
table1.add_row(["Random Forest", rfac, rfpc, rfrc, rff])
table1.add_row(["KNN", knnac, knnpc, knnrc, knnf])
table1.add_row(["SVM",svmac, svmpc, svmrc, svmf ])
table1.title = 'Results using GridSearchCV'
print(table1)

+--------------------------+----------+-----------+--------+----------+
| Classification algorithm | Accuracy | Precision | Recall | F1-score |
+--------------------------+----------+-----------+--------+----------+
|      Random Forest       |  0.9778  |   0.9778  | 0.9778 |  0.9778  |
|           KNN            |  0.9778  |   0.9778  | 0.9778 |  0.9778  |
|           SVM            |   1.0    |    1.0    |  1.0   |   1.0    |
+--------------------------+----------+-----------+--------+----------+


#Task 2. 
For breast cancer dataset (https://tinyurl.com/3vme8hr3) which could be loaded from datasets in sklearn as follows:

```
#Import scikit-learn dataset library
from sklearn import datasets

#Load dataset
cancer = datasets.load_breast_cancer()
```

*   Apply **GridSearchCV** to different classification algorithms such as **SVM, kNN, LogisticRegression, RandomForest**.
*   Compare the results obtained by the best hyperparameters among classification algorithms.

In [29]:
# Import scikit-learn dataset library
from sklearn import datasets

# Load dataset
cancer = datasets.load_breast_cancer()

In [43]:
X = cancer['data']
y = cancer['target']
# print(iris)
# X, y without Feature Selection.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.3, random_state = 10)

*   2.1. Apply **GridSearchCV** to **SVM** 


In [44]:
# code
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}
svc = svm.SVC()
svc.fit(X_train, y_train)
predictions = svc.predict(X_test)
print(classification_report(y_test, predictions))

clf = GridSearchCV(SVC(), param_grid, refit=True, scoring = 'accuracy')
clf.fit(X_train, y_train)
print(sorted(clf.cv_results_.keys()))

# print best parameter after tuning
print(clf.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(clf.best_estimator_)
print(clf.best_score_)
y_pred = svc.predict(X_test)
svmac = round(accuracy_score(y_test, y_pred), 2)
svmpc = round(precision_score(y_test, y_pred, average='micro'), 2)
svmrc= round(recall_score(y_test, y_pred, average='micro'), 2)
svmf= round(f1_score(y_test, y_pred, average='micro'), 2)
print ("Accuracy : ", svmac) 
print ("Precision : ", svmpc) 
print ("Recall score : ", svmrc) 
print ("F1 score : ", svmf)

              precision    recall  f1-score   support

           0       0.95      1.00      0.98        59
           1       1.00      0.97      0.99       112

    accuracy                           0.98       171
   macro avg       0.98      0.99      0.98       171
weighted avg       0.98      0.98      0.98       171

['mean_fit_time', 'mean_score_time', 'mean_test_score', 'param_C', 'param_gamma', 'param_kernel', 'params', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'std_fit_time', 'std_score_time', 'std_test_score']
{'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
SVC(C=1000, gamma=0.0001)
0.9824050632911392
Accuracy :  0.98
Precision :  0.98
Recall score :  0.98
F1 score :  0.98


*   2.2. Apply **GridSearchCV** to **kNN** 

In [45]:
#code
#code#code
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)
print(classification_report(y_test, predictions))

clf = GridSearchCV(KNeighborsClassifier(), grid_params,scoring = 'accuracy', verbose = 1, cv=3, n_jobs = -1, refit = True)
clf.fit(X_train, y_train)
print(sorted(clf.cv_results_.keys()))

# print best parameter after tuning
print(clf.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(clf.best_estimator_)
print(clf.best_score_)

y_pred_knn =  knn.predict(X_test)
knnac =round(accuracy_score(y_test, y_pred_knn), 4)
knnpc =round(precision_score(y_test, y_pred_knn, average = 'micro'), 4)
knnrc=round(recall_score(y_test, y_pred_knn, average = 'micro'), 4)
knnf=round(f1_score(y_test, y_pred_knn, average = 'micro'), 4)
print('kNN Classifier:')
print('Accuracy:', knnac)
print('Precision:', knnpc)
print('Recall:', knnrc)
print('F1-score:', knnf)
print()


              precision    recall  f1-score   support

           0       0.98      0.97      0.97        59
           1       0.98      0.99      0.99       112

    accuracy                           0.98       171
   macro avg       0.98      0.98      0.98       171
weighted avg       0.98      0.98      0.98       171

Fitting 3 folds for each of 36 candidates, totalling 108 fits
['mean_fit_time', 'mean_score_time', 'mean_test_score', 'param_metric', 'param_n_neighbors', 'param_weights', 'params', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'std_fit_time', 'std_score_time', 'std_test_score']
{'metric': 'minkowski', 'n_neighbors': 7, 'weights': 'distance'}
KNeighborsClassifier(n_neighbors=7, weights='distance')
0.9573175362649047
kNN Classifier:
Accuracy: 0.9825
Precision: 0.9825
Recall: 0.9825
F1-score: 0.9825



*   2.3. Apply **GridSearchCV** to **LogisticRegression** 

In [51]:
#code
clf_lr = LogisticRegression(solver='lbfgs', penalty='l2', max_iter=1000)
clf_lr.fit(X_train, y_train)
param_grid = {'C': [0.01, 0.1, 1, 10]}

grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, refit= True)
grid_search.fit(X_train, y_train)
sorted(grid_search.cv_results_.keys())

# print best parameter after tuning
print(grid_search.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(grid_search.best_estimator_)
print(clf.best_estimator_)
print(clf.best_score_)

y_pred_lr = clf_lr.predict(X_test)
lrac = round(accuracy_score(y_test, y_pred_lr), 2)
lrpc = round(precision_score(y_test, y_pred_lr), 2)
lrrc= round(recall_score(y_test, y_pred_lr), 2)
lrf= round(f1_score(y_test, y_pred_lr), 2)
print('Logistic Regression Classifier:')
print('Accuracy:', lrac)
print('Precision:', lrpc)
print('Recall:', lrrc)
print('F1-score:', lrf)
print()

{'C': 1}
LogisticRegression(C=1)
KNeighborsClassifier(n_neighbors=7, weights='distance')
0.9573175362649047
Logistic Regression Classifier:
Accuracy: 0.96
Precision: 0.98
Recall: 0.96
F1-score: 0.97



*   2.4. Apply **GridSearchCV** to **RandomForest** 

In [36]:
#code
#code
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}
rfcsf=RandomForestClassifier()
#Train the model using the training sets
rfcsf.fit(X_train,y_train)
y_pred = rfcsf.predict(X_test)
print(classification_report(y_test, y_pred))
CV_rfc = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train)
print(sorted(clf.cv_results_.keys()))

# print best parameter after tuning
print(clf.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(clf.best_estimator_)
print(clf.best_score_)
rfac = round(accuracy_score(y_test, y_pred), 4)
rfpc = round(precision_score(y_test, y_pred, average='micro'), 4)
rfrc= round(recall_score(y_test, y_pred, average='micro'), 4)
rff= round(f1_score(y_test, y_pred, average='micro'), 4)
print ("Accuracy : ", rfac) 
print ("Precision : ", rfpc) 
print ("Recall score : ", rfrc) 
print ("F1 score : ", rff)

              precision    recall  f1-score   support

           0       0.95      1.00      0.98        59
           1       1.00      0.97      0.99       112

    accuracy                           0.98       171
   macro avg       0.98      0.99      0.98       171
weighted avg       0.98      0.98      0.98       171

['mean_fit_time', 'mean_score_time', 'mean_test_score', 'param_metric', 'param_n_neighbors', 'param_weights', 'params', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'std_fit_time', 'std_score_time', 'std_test_score']
{'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'uniform'}
KNeighborsClassifier(metric='manhattan', n_neighbors=7)
0.9371724766461608
Accuracy :  0.9825
Precision :  0.9825
Recall score :  0.9825
F1 score :  0.9825


*   2.5. Compare the best obtained results among classification algorithms (use PrettyTable to dispaly the results) 

In [52]:
#code
table2 = PrettyTable()
table2.field_names = ["Classification algorithm", "Accuracy", "Precision", "Recall", "F1-score"]
table2.add_row(["Random Forest", rfac, rfpc, rfrc, rff])
table2.add_row(["KNN", knnac, knnpc, knnrc, knnf])
table2.add_row(["SVM",svmac, svmpc, svmrc, svmf ])
table2.add_row(["LogisticRegression",lrac, lrpc, lrrc, lrf ])
table2.title = 'Results using GridSearchCV'
print(table2)

+--------------------------+----------+-----------+--------+----------+
| Classification algorithm | Accuracy | Precision | Recall | F1-score |
+--------------------------+----------+-----------+--------+----------+
|      Random Forest       |  0.9825  |   0.9825  | 0.9825 |  0.9825  |
|           KNN            |  0.9825  |   0.9825  | 0.9825 |  0.9825  |
|           SVM            |   0.98   |    0.98   |  0.98  |   0.98   |
|    LogisticRegression    |   0.96   |    0.98   |  0.96  |   0.97   |
+--------------------------+----------+-----------+--------+----------+


#Task 3. 
The dataset consists of **2000 user-created movie reviews** archived on the IMDb(Internet Movie Database). The reviews are equally partitioned into a positive set and a negative set (1000+1000). Each review consists of a plain text file (.txt) and a class label representing the overall user opinion. 
The class attribute has only two values: **pos** (positive) or **neg** (negative).


*   3.1 Importing additional libraries

In [1]:
import nltk, random
nltk.download('movie_reviews')#download movie reviews dataset
from nltk.corpus import movie_reviews
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_val_score
from collections import Counter
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


*   3.2. Movie reviews information

In [2]:
#code
print(len(movie_reviews.fileids()))
print(movie_reviews.categories())
print(movie_reviews.words()[:100])
print(movie_reviews.fileids()[:10])

2000
['neg', 'pos']
['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]
['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt']


*   3.3. Create dataset from movie reviews

In [3]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.seed(123)
random.shuffle(documents)

In [4]:
print('Number of Reviews/Documents: {}'.format(len(documents)))
print('Corpus Size (words): {}'.format(np.sum([len(d) for (d,l) in documents])))
print('Sample Text of Doc 1:')
print('-'*30)
print(' '.join(documents[0][0][:50])) # first 50 words of the first document

Number of Reviews/Documents: 2000
Corpus Size (words): 1583820
Sample Text of Doc 1:
------------------------------
most movies seem to release a third movie just so it can be called a trilogy . rocky iii seems to kind of fit in that category , but manages to be slightly unique . the rocky formula of " rocky loses fight / rocky trains / rocky wins fight


In [5]:
sentiment_distr = Counter([label for (words, label) in documents])
print(sentiment_distr)

Counter({'pos': 1000, 'neg': 1000})


*   3.4. Train test split

In [14]:
train, test = train_test_split(documents, test_size = 0.4, random_state=42)

In [15]:
## Sentiment Distrubtion for Train and Test
print(Counter([label for (words, label) in train]))
print(Counter([label for (words, label) in test]))

Counter({'neg': 615, 'pos': 585})
Counter({'pos': 415, 'neg': 385})


In [16]:
X_train = [' '.join(words) for (words, label) in train]
X_test = [' '.join(words) for (words, label) in test]
y_train = [label for (words, label) in train]
y_test = [label for (words, label) in test]

*   3.5. Text Vectorization

In [17]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

tfidf_vec = TfidfVectorizer(min_df = 10, token_pattern = r'[a-zA-Z]+')
X_train_bow = tfidf_vec.fit_transform(X_train) # fit train
X_test_bow = tfidf_vec.transform(X_test) # transform test

*   3.6. Apply **SVM** with **GridSearchCV** 

In [18]:
# code
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf', 'poly']}
svc = svm.SVC()
svc.fit(X_train_bow, y_train)
predictions = svc.predict(X_test_bow)
print(classification_report(y_test, predictions))

clf = GridSearchCV(SVC(), param_grid, refit=True, scoring = 'accuracy')
clf.fit(X_train_bow, y_train)
print(sorted(clf.cv_results_.keys()))

# print best parameter after tuning
print(clf.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(clf.best_estimator_)
print(clf.best_score_)
y_pred = svc.predict(X_test_bow)
svmac = round(accuracy_score(y_test, y_pred), 2)
svmpc = round(precision_score(y_test, y_pred, average='micro'), 2)
svmrc= round(recall_score(y_test, y_pred, average='micro'), 2)
svmf= round(f1_score(y_test, y_pred, average='micro'), 2)
print ("Accuracy : ", svmac) 
print ("Precision : ", svmpc) 
print ("Recall score : ", svmrc) 
print ("F1 score : ", svmf)

              precision    recall  f1-score   support

         neg       0.77      0.82      0.79       385
         pos       0.82      0.78      0.80       415

    accuracy                           0.80       800
   macro avg       0.80      0.80      0.80       800
weighted avg       0.80      0.80      0.80       800

['mean_fit_time', 'mean_score_time', 'mean_test_score', 'param_C', 'param_kernel', 'params', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'std_fit_time', 'std_score_time', 'std_test_score']
{'C': 1, 'kernel': 'linear'}
SVC(C=1, kernel='linear')
0.8291666666666666
Accuracy :  0.8
Precision :  0.8
Recall score :  0.8
F1 score :  0.8


*   3.7. Apply **RandomForest** with **GridSearchCV** 

In [19]:
#code
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt']
}
rfcsf=RandomForestClassifier()
#Train the model using the training sets
rfcsf.fit(X_train_bow,y_train)
y_pred = rfcsf.predict(X_test_bow)
print(classification_report(y_test, y_pred))
CV_rfc = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train_bow, y_train)
print(sorted(clf.cv_results_.keys()))

# print best parameter after tuning
print(clf.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(clf.best_estimator_)
print(clf.best_score_)
rfac = round(accuracy_score(y_test, y_pred), 4)
rfpc = round(precision_score(y_test, y_pred, average='micro'), 4)
rfrc= round(recall_score(y_test, y_pred, average='micro'), 4)
rff= round(f1_score(y_test, y_pred, average='micro'), 4)
print ("Accuracy : ", rfac) 
print ("Precision : ", rfpc) 
print ("Recall score : ", rfrc) 
print ("F1 score : ", rff)

              precision    recall  f1-score   support

         neg       0.72      0.85      0.78       385
         pos       0.83      0.69      0.76       415

    accuracy                           0.77       800
   macro avg       0.78      0.77      0.77       800
weighted avg       0.78      0.77      0.77       800

['mean_fit_time', 'mean_score_time', 'mean_test_score', 'param_C', 'param_kernel', 'params', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'std_fit_time', 'std_score_time', 'std_test_score']
{'C': 1, 'kernel': 'linear'}
SVC(C=1, kernel='linear')
0.8291666666666666
Accuracy :  0.7675
Precision :  0.7675
Recall score :  0.7675
F1 score :  0.7675


*   3.8. Apply **kNN** with **GridSearchCV** 

In [20]:
#code
param_grid = {
    'n_neighbors': [3, 5, 7],
    'p': [1, 2],
}
knn = KNeighborsClassifier()
knn.fit(X_train_bow, y_train)
predictions = knn.predict(X_test_bow)
print(classification_report(y_test, predictions))

clf = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, refit = True)
clf.fit(X_train_bow, y_train)
print(sorted(clf.cv_results_.keys()))

# print best parameter after tuning
print(clf.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(clf.best_estimator_)
print(clf.best_score_)

y_pred_knn =  knn.predict(X_test_bow)
knnac =round(accuracy_score(y_test, y_pred_knn), 4)
knnpc =round(precision_score(y_test, y_pred_knn, average = 'micro'), 4)
knnrc=round(recall_score(y_test, y_pred_knn, average = 'micro'), 4)
knnf=round(f1_score(y_test, y_pred_knn, average = 'micro'), 4)
print('kNN Classifier:')
print('Accuracy:', knnac)
print('Precision:', knnpc)
print('Recall:', knnrc)
print('F1-score:', knnf)
print()

              precision    recall  f1-score   support

         neg       0.69      0.42      0.52       385
         pos       0.61      0.83      0.70       415

    accuracy                           0.63       800
   macro avg       0.65      0.62      0.61       800
weighted avg       0.65      0.63      0.61       800

['mean_fit_time', 'mean_score_time', 'mean_test_score', 'param_n_neighbors', 'param_p', 'params', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'std_fit_time', 'std_score_time', 'std_test_score']
{'n_neighbors': 7, 'p': 2}
KNeighborsClassifier(n_neighbors=7)
0.6283333333333333
kNN Classifier:
Accuracy: 0.6312
Precision: 0.6312
Recall: 0.6312
F1-score: 0.6312



*   3.9. Apply **LogisticRegression** with **GridSearchCV** 

In [27]:
#code
clf_lr = LogisticRegression()
clf_lr.fit(X_train_bow, y_train)
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'max_iter': [100, 500, 1000, 2000]
}
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, refit= True)
grid_search.fit(X_train_bow, y_train)
sorted(grid_search.cv_results_.keys())

# print best parameter after tuning
print(grid_search.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(grid_search.best_estimator_)
print(clf.best_estimator_)
print(clf.best_score_)

y_pred_lr = clf_lr.predict(X_test_bow)
lrac = round(accuracy_score(y_test, y_pred_lr), 2)
lrpc = round(precision_score(y_test, y_pred_lr, average = 'micro'), 2)
lrrc= round(recall_score(y_test, y_pred_lr, average = 'micro'), 2)
lrf= round(f1_score(y_test, y_pred_lr, average = 'micro'), 2)
print('Logistic Regression Classifier:')
print('Accuracy:', lrac)
print('Precision:', lrpc)
print('Recall:', lrrc)
print('F1-score:', lrf)
print()

{'C': 10, 'max_iter': 100}
LogisticRegression(C=10)
KNeighborsClassifier(n_neighbors=7)
0.6283333333333333
Logistic Regression Classifier:
Accuracy: 0.79
Precision: 0.79
Recall: 0.79
F1-score: 0.79



*   3.10. Compare the best obtained results among classification algorithms (use PrettyTable to dispaly the results) 

In [28]:
#code
table3 = PrettyTable()
table3.field_names = ["Classification algorithm", "Accuracy", "Precision", "Recall", "F1-score"]
table3.add_row(["Random Forest", rfac, rfpc, rfrc, rff])
table3.add_row(["KNN", knnac, knnpc, knnrc, knnf])
table3.add_row(["SVM",svmac, svmpc, svmrc, svmf ])
table3.add_row(["LogisticRegression",lrac, lrpc, lrrc, lrf ])
table3.title = 'Results using GridSearchCV'
print(table3)

+--------------------------+----------+-----------+--------+----------+
| Classification algorithm | Accuracy | Precision | Recall | F1-score |
+--------------------------+----------+-----------+--------+----------+
|      Random Forest       |  0.7675  |   0.7675  | 0.7675 |  0.7675  |
|           KNN            |  0.6312  |   0.6312  | 0.6312 |  0.6312  |
|           SVM            |   0.8    |    0.8    |  0.8   |   0.8    |
|    LogisticRegression    |   0.79   |    0.79   |  0.79  |   0.79   |
+--------------------------+----------+-----------+--------+----------+


#Finally,
Save a copy in your Github. Remember renaming the notebook.