<a href="https://colab.research.google.com/github/tranbuituanngoc/ML/blob/main/Lab_8_20130337_TranBuiTuanNgoc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This lab deals with **GridSearchCV** for tuning the hyper-parameters of an estimator and applying vectorization techniques to the **movie reviews dataset** for classification task. 

*   **Deadline: 23:59, 17/4/2023**



# Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn import svm
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from prettytable import PrettyTable
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import datasets

#Task 1. With **iris** dataset
*  1.1. Apply **GridSearchCV** for **SVM** to find the best hyperparameters using the following param_grid.

```
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}
```




In [None]:
iris = datasets.load_iris()

In [None]:
X= iris['data']
Y= iris['target']
clf=svm.SVC()
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, train_size=0.7,  random_state = 1)

In [None]:
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}

grid_svm_class= GridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy', n_jobs=4,cv=10,refit=True,return_train_score=True)
grid_svm_class.fit(X_train,y_train)
y_pred= grid_svm_class.predict(X_test)
print(grid_svm_class.best_estimator_)
print(grid_svm_class.best_index_)
print(grid_svm_class.best_params_)
print(grid_svm_class.best_score_)

SVC(C=1, gamma=1, kernel='linear')
11
{'C': 1, 'gamma': 1, 'kernel': 'linear'}
0.9809090909090908


In [None]:
kncm=confusion_matrix(y_test, y_pred)
knas=round(accuracy_score(y_test, y_pred),2)
knps=round(precision_score(y_test, y_pred, average='micro'),2)
knrs=round(recall_score(y_test, y_pred, average='micro'),2)
knfs=round(f1_score(y_test, y_pred, average='micro'),2)
print ("Confusion Matrix : \n", confusion_matrix(y_test, y_pred))
print ("Accuracy : ", accuracy_score(y_test, y_pred))
print ("Precision : ", precision_score(y_test, y_pred, average='micro'))
print ("Recall : ", recall_score(y_test, y_pred, average='micro'))
print ("F1 : ", f1_score(y_test, y_pred, average='micro'))

Confusion Matrix : 
 [[14  0  0]
 [ 0 18  0]
 [ 0  0 13]]
Accuracy :  1.0
Precision :  1.0
Recall :  1.0
F1 :  1.0


*  1.2. Apply **GridSearchCV** for **kNN** to find the best hyperparameters using the following param_grid.

```
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
```
where

    *  **n_neighbors**: Decide the best k based on the values we have computed earlier.
    *  **weights**: Check whether adding weights to the data points is beneficial to the model or not. 'uniform' assigns no weight, while 'distance' weighs points by the inverse of their distances meaning nearer points will have more weight than the farther points.
    *  **metric**: The distance metric to be used will calculating the similarity.


In [None]:
clfKNN = KNeighborsClassifier()
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
grid_svm_class= GridSearchCV(estimator=clfKNN, param_grid=grid_params, scoring='accuracy', verbose=1,cv=10,refit=True,return_train_score=True)
grid_svm_class.fit(X_train,y_train)
y_pred= grid_svm_class.predict(X_test)
print(grid_svm_class.best_estimator_)
print(grid_svm_class.best_index_)
print(grid_svm_class.best_params_)
print(grid_svm_class.best_score_)

Fitting 10 folds for each of 36 candidates, totalling 360 fits
KNeighborsClassifier(n_neighbors=9, weights='distance')
5
{'metric': 'minkowski', 'n_neighbors': 9, 'weights': 'distance'}
0.9709090909090909


In [None]:
knncm=confusion_matrix(y_test, y_pred)
knnas=round(accuracy_score(y_test, y_pred),2)
knnps=round(precision_score(y_test, y_pred, average='micro'),2)
knnrs=round(recall_score(y_test, y_pred, average='micro'),2)
knnfs=round(f1_score(y_test, y_pred, average='micro'),2)
print ("Confusion Matrix : \n", confusion_matrix(y_test, y_pred))
print ("Accuracy : ", accuracy_score(y_test, y_pred))
print ("Precision : ", precision_score(y_test, y_pred, average='micro'))
print ("Recall : ", recall_score(y_test, y_pred, average='micro'))
print ("F1 : ", f1_score(y_test, y_pred, average='micro'))

Confusion Matrix : 
 [[14  0  0]
 [ 0 17  1]
 [ 0  1 12]]
Accuracy :  0.9555555555555556
Precision :  0.9555555555555556
Recall :  0.9555555555555556
F1 :  0.9555555555555556


*  1.3. Apply **GridSearchCV** for **Random Forest** to find the best hyperparameters using the following param_grid.

```
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}
```

In [None]:
clfRF = RandomForestClassifier(criterion='entropy', max_features='auto')
gparam_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}
grid_svm_class= GridSearchCV(estimator=clfRF, param_grid=gparam_grid, scoring='accuracy', n_jobs=4,cv=10,refit=True,return_train_score=True)
grid_svm_class.fit(X_train,y_train)
y_pred= grid_svm_class.predict(X_test)
print(grid_svm_class.best_estimator_)
print(grid_svm_class.best_index_)
print(grid_svm_class.best_params_)
print(grid_svm_class.best_score_)

RandomForestClassifier(criterion='entropy', max_depth=6, max_features=None,
                       max_leaf_nodes=6, n_estimators=50)
65
{'max_depth': 6, 'max_features': None, 'max_leaf_nodes': 6, 'n_estimators': 50}
0.9609090909090909


In [None]:
sicm=confusion_matrix(y_test, y_pred)
rfas=round(accuracy_score(y_test, y_pred),2)
rfps=round(precision_score(y_test, y_pred, average='micro'),2)
rfrs=round(recall_score(y_test, y_pred, average='micro'),2)
rffs=round(f1_score(y_test, y_pred, average='micro'),2)
print ("Confusion Matrix : \n", confusion_matrix(y_test, y_pred))
print ("Accuracy : ", accuracy_score(y_test, y_pred))
print ("Precision : ", precision_score(y_test, y_pred, average='micro'))
print ("Recall : ", recall_score(y_test, y_pred, average='micro'))
print ("F1 : ", f1_score(y_test, y_pred, average='micro'))

Confusion Matrix : 
 [[14  0  0]
 [ 0 17  1]
 [ 0  1 12]]
Accuracy :  0.9555555555555556
Precision :  0.9555555555555556
Recall :  0.9555555555555556
F1 :  0.9555555555555556


*   1.4 Compare the best obtained results from 1.1 to 1.3 (use PrettyTable to dispaly the results)

In [None]:
t= PrettyTable(['Clasicfical Algorithm', 'Accuracy','Precision','Recall','F1'])
t.add_row(['SVM',knas,knps,knrs,knfs])
t.add_row(['kNN',knnas,knnps,knnrs,knnfs])
t.add_row(['Random Forest',rfas,rfps,rfrs,rffs])
print(t)

+-----------------------+----------+-----------+--------+------+
| Clasicfical Algorithm | Accuracy | Precision | Recall |  F1  |
+-----------------------+----------+-----------+--------+------+
|          SVM          |   1.0    |    1.0    |  1.0   | 1.0  |
|          kNN          |   0.96   |    0.96   |  0.96  | 0.96 |
|     Random Forest     |   0.96   |    0.96   |  0.96  | 0.96 |
+-----------------------+----------+-----------+--------+------+


#Task 2. 
For breast cancer dataset (https://tinyurl.com/3vme8hr3) which could be loaded from datasets in sklearn as follows:

```
#Import scikit-learn dataset library
from sklearn import datasets

#Load dataset
cancer = datasets.load_breast_cancer()
```

*   Apply **GridSearchCV** to different classification algorithms such as **SVM, kNN, LogisticRegression, RandomForest**.
*   Compare the results obtained by the best hyperparameters among classification algorithms.

In [None]:
# Import scikit-learn dataset library
from sklearn import datasets

# Load dataset
cancer = datasets.load_breast_cancer()

*   2.1. Apply **GridSearchCV** to **SVM** 


In [None]:
X= cancer['data']
Y= cancer['target']
clf=svm.SVC(kernel="sigmoid", random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, train_size=0.7,  random_state = 1)

In [None]:
# Giảm kích thước tập test để giúp code có thể chạy
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, Y, test_size=0.7,  random_state = 1)
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}

grid_svm_class= GridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy', n_jobs=4,cv=10,refit=True,return_train_score=True)
grid_svm_class.fit(X_train2,y_train2)
y_pred= grid_svm_class.predict(X_test2)
print(grid_svm_class.best_estimator_)
print(grid_svm_class.best_index_)
print(grid_svm_class.best_params_)
print(grid_svm_class.best_score_)

SVC(C=0.1, gamma=1, kernel='linear', random_state=0)
1
{'C': 0.1, 'gamma': 1, 'kernel': 'linear'}
0.9647058823529411


In [None]:
kncm=confusion_matrix(y_test2, y_pred)
knas=round(accuracy_score(y_test2, y_pred),2)
knps=round(precision_score(y_test2, y_pred, average='micro'),2)
knrs=round(recall_score(y_test2, y_pred, average='micro'),2)
knfs=round(f1_score(y_test2, y_pred, average='micro'),2)
print ("Confusion Matrix : \n", confusion_matrix(y_test2, y_pred))
print ("Accuracy : ", accuracy_score(y_test2, y_pred))
print ("Precision : ", precision_score(y_test2, y_pred, average='micro'))
print ("Recall : ", recall_score(y_test2, y_pred, average='micro'))
print ("F1 : ", f1_score(y_test2, y_pred, average='micro'))

Confusion Matrix : 
 [[133  17]
 [  9 240]]
Accuracy :  0.9348370927318296
Precision :  0.9348370927318296
Recall :  0.9348370927318296
F1 :  0.9348370927318296


*   2.2. Apply **GridSearchCV** to **kNN** 

In [None]:
clfKNN = KNeighborsClassifier()
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
grid_svm_class= GridSearchCV(estimator=clfKNN, param_grid=grid_params, scoring='accuracy', verbose=1,cv=10,refit=True,return_train_score=True)
grid_svm_class.fit(X_train,y_train)
y_pred= grid_svm_class.predict(X_test)
print(grid_svm_class.best_estimator_)
print(grid_svm_class.best_index_)
print(grid_svm_class.best_params_)
print(grid_svm_class.best_score_)

Fitting 10 folds for each of 36 candidates, totalling 360 fits
KNeighborsClassifier(metric='manhattan', n_neighbors=13)
32
{'metric': 'manhattan', 'n_neighbors': 13, 'weights': 'uniform'}
0.9371153846153847


In [None]:
knncm=confusion_matrix(y_test, y_pred)
knnas=round(accuracy_score(y_test, y_pred),2)
knnps=round(precision_score(y_test, y_pred, average='micro'),2)
knnrs=round(recall_score(y_test, y_pred, average='micro'),2)
knnfs=round(f1_score(y_test, y_pred, average='micro'),2)
print ("Confusion Matrix : \n", confusion_matrix(y_test, y_pred))
print ("Accuracy : ", accuracy_score(y_test, y_pred))
print ("Precision : ", precision_score(y_test, y_pred, average='micro'))
print ("Recall : ", recall_score(y_test, y_pred, average='micro'))
print ("F1 : ", f1_score(y_test, y_pred, average='micro'))

Confusion Matrix : 
 [[ 55   8]
 [  3 105]]
Accuracy :  0.935672514619883
Precision :  0.935672514619883
Recall :  0.935672514619883
F1 :  0.935672514619883


*   2.3. Apply **GridSearchCV** to **LogisticRegression** 

In [None]:
clfLR = LogisticRegression() 
grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
grid_svm_class= GridSearchCV(estimator=clfLR, param_grid=grid, scoring='accuracy', verbose=1,cv=10,refit=True,return_train_score=True)
grid_svm_class.fit(X_train,y_train)
y_pred= grid_svm_class.predict(X_test)
print(grid_svm_class.best_estimator_)
print(grid_svm_class.best_index_)
print(grid_svm_class.best_params_)
print(grid_svm_class.best_score_)

Fitting 10 folds for each of 14 candidates, totalling 140 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression(C=1000.0)
13
{'C': 1000.0, 'penalty': 'l2'}
0.9448717948717948


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
70 fits failed out of a total of 140.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
70 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.9/dist-packag

In [None]:
lras=round(accuracy_score(y_test, y_pred),2)
lrps=round(precision_score(y_test, y_pred),2)
lrrs=round(recall_score(y_test, y_pred),2)
lrfs=round(f1_score(y_test, y_pred),2)
print ("Confusion Matrix : \n", confusion_matrix(y_test, y_pred))
print ("Accuracy : ", accuracy_score(y_test, y_pred))
print ("Precision : ", precision_score(y_test, y_pred, average='micro'))
print ("Recall : ", recall_score(y_test, y_pred, average='micro'))
print ("F1 : ", f1_score(y_test, y_pred, average='micro'))

Confusion Matrix : 
 [[ 57   6]
 [  6 102]]
Accuracy :  0.9298245614035088
Precision :  0.9298245614035088
Recall :  0.9298245614035088
F1 :  0.9298245614035088


*   2.4. Apply **GridSearchCV** to **RandomForest** 

In [None]:
clfRF = RandomForestClassifier(criterion='entropy', max_features='auto')
gparam_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}
grid_svm_class= GridSearchCV(estimator=clfRF, param_grid=gparam_grid, scoring='accuracy', n_jobs=4,cv=10,refit=True,return_train_score=True)
grid_svm_class.fit(X_train,y_train)
y_pred= grid_svm_class.predict(X_test)
print(grid_svm_class.best_estimator_)
print(grid_svm_class.best_index_)
print(grid_svm_class.best_params_)
print(grid_svm_class.best_score_)

RandomForestClassifier(criterion='entropy', max_depth=6, max_features=None,
                       max_leaf_nodes=9, n_estimators=25)
68
{'max_depth': 6, 'max_features': None, 'max_leaf_nodes': 9, 'n_estimators': 25}
0.9673717948717948


In [None]:
sicm=confusion_matrix(y_test, y_pred)
rfas=round(accuracy_score(y_test, y_pred),2)
rfps=round(precision_score(y_test, y_pred, average='micro'),2)
rfrs=round(recall_score(y_test, y_pred, average='micro'),2)
rffs=round(f1_score(y_test, y_pred, average='micro'),2)
print ("Confusion Matrix : \n", confusion_matrix(y_test, y_pred))
print ("Accuracy : ", accuracy_score(y_test, y_pred))
print ("Precision : ", precision_score(y_test, y_pred, average='micro'))
print ("Recall : ", recall_score(y_test, y_pred, average='micro'))
print ("F1 : ", f1_score(y_test, y_pred, average='micro'))

Confusion Matrix : 
 [[ 57   6]
 [  5 103]]
Accuracy :  0.935672514619883
Precision :  0.935672514619883
Recall :  0.935672514619883
F1 :  0.935672514619883


*   2.5. Compare the best obtained results among classification algorithms (use PrettyTable to dispaly the results) 

In [None]:
t= PrettyTable(['Clasicfical Algorithm', 'Accuracy','Precision','Recall','F1'])
t.add_row(['SVM',knas,knps,knrs,knfs])
t.add_row(['kNN',knnas,knnps,knnrs,knnfs])
t.add_row(['Random Forest',rfas,rfps,rfrs,rffs])
t.add_row(['LogisticRegression',lras,lrps,lrrs,lrfs])
print(t)

+-----------------------+----------+-----------+--------+------+
| Clasicfical Algorithm | Accuracy | Precision | Recall |  F1  |
+-----------------------+----------+-----------+--------+------+
|          SVM          |   0.93   |    0.93   |  0.93  | 0.93 |
|          kNN          |   0.94   |    0.94   |  0.94  | 0.94 |
|     Random Forest     |   0.94   |    0.94   |  0.94  | 0.94 |
|   LogisticRegression  |   0.93   |    0.94   |  0.94  | 0.94 |
+-----------------------+----------+-----------+--------+------+


#Task 3. 
The dataset consists of **2000 user-created movie reviews** archived on the IMDb(Internet Movie Database). The reviews are equally partitioned into a positive set and a negative set (1000+1000). Each review consists of a plain text file (.txt) and a class label representing the overall user opinion. 
The class attribute has only two values: **pos** (positive) or **neg** (negative).


*   3.1 Importing additional libraries

In [None]:
import nltk, random
nltk.download('movie_reviews')#download movie reviews dataset
from nltk.corpus import movie_reviews
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_val_score
from collections import Counter
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


*   3.2. Movie reviews information

In [None]:
#code
print(len(movie_reviews.fileids()))
print(movie_reviews.categories())
print(movie_reviews.words()[:100])
print(movie_reviews.fileids()[:10])

2000
['neg', 'pos']
['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]
['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt']


*   3.3. Create dataset from movie reviews

In [None]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.seed(123)
random.shuffle(documents)

In [None]:
print('Number of Reviews/Documents: {}'.format(len(documents)))
print('Corpus Size (words): {}'.format(np.sum([len(d) for (d,l) in documents])))
print('Sample Text of Doc 1:')
print('-'*30)
print(' '.join(documents[0][0][:50])) # first 50 words of the first document

Number of Reviews/Documents: 2000
Corpus Size (words): 1583820
Sample Text of Doc 1:
------------------------------
most movies seem to release a third movie just so it can be called a trilogy . rocky iii seems to kind of fit in that category , but manages to be slightly unique . the rocky formula of " rocky loses fight / rocky trains / rocky wins fight


In [None]:
sentiment_distr = Counter([label for (words, label) in documents])
print(sentiment_distr)

Counter({'pos': 1000, 'neg': 1000})


*   3.4. Train test split

In [None]:
train, test = train_test_split(documents, test_size = 0.8, random_state=42)

In [None]:
## Sentiment Distrubtion for Train and Test
print(Counter([label for (words, label) in train]))
print(Counter([label for (words, label) in test]))

Counter({'neg': 205, 'pos': 195})
Counter({'pos': 805, 'neg': 795})


In [None]:
X_train = [' '.join(words) for (words, label) in train]
X_test = [' '.join(words) for (words, label) in test]
y_train = [label for (words, label) in train]
y_test = [label for (words, label) in test]

*   3.5. Text Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
tfidf_vec = TfidfVectorizer(min_df = 10, token_pattern = r'[a-zA-Z]+')
X_train_bow = tfidf_vec.fit_transform(X_train) # fit train
X_test_bow = tfidf_vec.transform(X_test) # transform test

*   3.6. Apply **SVM** with **GridSearchCV** 

In [None]:
clf=svm.SVC()

In [None]:
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf', 'linear']}
grid_svm_class = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy', n_jobs=4, cv=10, refit=True, return_train_score=True, error_score='raise')
grid_svm_class.fit(X_train_bow, [l for (d, l) in train])
y_pred = grid_svm_class.predict(X_test_bow)

print(grid_svm_class.best_estimator_)
print(grid_svm_class.best_index_)
print(grid_svm_class.best_params_)
print(grid_svm_class.best_score_)

SVC(C=10, gamma=1, kernel='linear')
21
{'C': 10, 'gamma': 1, 'kernel': 'linear'}
0.8125


In [None]:
kncm=confusion_matrix(y_test, y_pred)
knas=round(accuracy_score(y_test, y_pred),2)
knps=round(precision_score(y_test, y_pred, average='micro'),2)
knrs=round(recall_score(y_test, y_pred, average='micro'),2)
knfs=round(f1_score(y_test, y_pred, average='micro'),2)
print ("Confusion Matrix : \n", confusion_matrix(y_test, y_pred))
print ("Accuracy : ", accuracy_score(y_test, y_pred))
print ("Precision : ", precision_score(y_test, y_pred, average='micro'))
print ("Recall : ", recall_score(y_test, y_pred, average='micro'))
print ("F1 : ", f1_score(y_test, y_pred, average='micro'))

Confusion Matrix : 
 [[618 177]
 [183 622]]
Accuracy :  0.775
Precision :  0.775
Recall :  0.775
F1 :  0.775


*   3.7. Apply **RandomForest** with **GridSearchCV** 

In [None]:
clfRF = RandomForestClassifier(criterion='entropy', max_features='auto')
gparam_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}
grid_svm_class= GridSearchCV(estimator=clfRF, param_grid=gparam_grid, scoring='accuracy', n_jobs=4,cv=10,refit=True,return_train_score=True)
grid_svm_class.fit(X_train_bow, [l for (d, l) in train])
y_pred= grid_svm_class.predict(X_test_bow)
print(grid_svm_class.best_estimator_)
print(grid_svm_class.best_index_)
print(grid_svm_class.best_params_)
print(grid_svm_class.best_score_)

RandomForestClassifier(criterion='entropy', max_depth=6, max_leaf_nodes=9,
                       n_estimators=150)
47
{'max_depth': 6, 'max_features': 'sqrt', 'max_leaf_nodes': 9, 'n_estimators': 150}
0.7675


In [22]:
sicm=confusion_matrix(y_test, y_pred)
rfas=round(accuracy_score(y_test, y_pred),2)
rfps=round(precision_score(y_test, y_pred, average='micro'),2)
rfrs=round(recall_score(y_test, y_pred, average='micro'),2)
rffs=round(f1_score(y_test, y_pred, average='micro'),2)
print ("Confusion Matrix : \n", confusion_matrix(y_test, y_pred))
print ("Accuracy : ", accuracy_score(y_test, y_pred))
print ("Precision : ", precision_score(y_test, y_pred, average='micro'))
print ("Recall : ", recall_score(y_test, y_pred, average='micro'))
print ("F1 : ", f1_score(y_test, y_pred, average='micro'))

Confusion Matrix : 
 [[635 160]
 [212 593]]
Accuracy :  0.7675
Precision :  0.7675
Recall :  0.7675
F1 :  0.7675


*   3.8. Apply **kNN** with **GridSearchCV** 

In [23]:
clfKNN = KNeighborsClassifier()
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
grid_svm_class= GridSearchCV(estimator=clfKNN, param_grid=grid_params, scoring='accuracy', verbose=1,cv=10,refit=True,return_train_score=True)
grid_svm_class.fit(X_train_bow, [l for (d, l) in train])
y_pred= grid_svm_class.predict(X_test_bow)
print(grid_svm_class.best_estimator_)
print(grid_svm_class.best_index_)
print(grid_svm_class.best_params_)
print(grid_svm_class.best_score_)

Fitting 10 folds for each of 36 candidates, totalling 360 fits
KNeighborsClassifier(metric='manhattan', n_neighbors=15)
34
{'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'uniform'}
0.615


In [24]:
knncm=confusion_matrix(y_test, y_pred)
knnas=round(accuracy_score(y_test, y_pred),2)
knnps=round(precision_score(y_test, y_pred, average='micro'),2)
knnrs=round(recall_score(y_test, y_pred, average='micro'),2)
knnfs=round(f1_score(y_test, y_pred, average='micro'),2)
print ("Confusion Matrix : \n", confusion_matrix(y_test, y_pred))
print ("Accuracy : ", accuracy_score(y_test, y_pred))
print ("Precision : ", precision_score(y_test, y_pred, average='micro'))
print ("Recall : ", recall_score(y_test, y_pred, average='micro'))
print ("F1 : ", f1_score(y_test, y_pred, average='micro'))

Confusion Matrix : 
 [[328 467]
 [122 683]]
Accuracy :  0.631875
Precision :  0.631875
Recall :  0.631875
F1 :  0.631875


*   3.9. Apply **LogisticRegression** with **GridSearchCV** 

In [27]:
clfLR = LogisticRegression()
grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
grid_svm_class= GridSearchCV(estimator=clfLR, param_grid=grid, scoring='accuracy', verbose=1,cv=10,refit=True,return_train_score=True)
grid_svm_class.fit(X_train_bow, [label for (words, label) in train])
y_pred= grid_svm_class.predict(X_test_bow)
print(grid_svm_class.best_estimator_)
print(grid_svm_class.best_index_)
print(grid_svm_class.best_params_)
print(grid_svm_class.best_score_)

Fitting 10 folds for each of 14 candidates, totalling 140 fits
LogisticRegression(C=1000.0)
13
{'C': 1000.0, 'penalty': 'l2'}
0.8


70 fits failed out of a total of 140.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
70 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.9/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.9/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

    nan 0.8       nan 0.8   ]
        na

In [28]:
lras=round(accuracy_score([label for (words, label) in test], y_pred),2)
lrps=round(precision_score([label for (words, label) in test], y_pred, average='micro'),2)
lrrs=round(recall_score([label for (words, label) in test], y_pred, average='micro'),2)
lrfs=round(f1_score([label for (words, label) in test], y_pred, average='micro'),2)
print("Confusion Matrix:\n", confusion_matrix([label for (words, label) in test], y_pred))
print("Accuracy:", lras)
print("Precision:", lrps)
print("Recall:", lrrs)
print ("F1 : ", lrfs)

Confusion Matrix:
 [[622 173]
 [181 624]]
Accuracy: 0.78
Precision: 0.78
Recall: 0.78
F1 :  0.78


*   3.10. Compare the best obtained results among classification algorithms (use PrettyTable to dispaly the results) 

In [29]:
t= PrettyTable(['Clasicfical Algorithm', 'Accuracy','Precision','Recall','F1'])
t.add_row(['SVM',knas,knps,knrs,knfs])
t.add_row(['kNN',knnas,knnps,knnrs,knnfs])
t.add_row(['Random Forest',rfas,rfps,rfrs,rffs])
t.add_row(['LogisticRegression',lras,lrps,lrrs,lrfs])
print(t)

+-----------------------+----------+-----------+--------+------+
| Clasicfical Algorithm | Accuracy | Precision | Recall |  F1  |
+-----------------------+----------+-----------+--------+------+
|          SVM          |   0.78   |    0.78   |  0.78  | 0.78 |
|          kNN          |   0.63   |    0.63   |  0.63  | 0.63 |
|     Random Forest     |   0.77   |    0.77   |  0.77  | 0.77 |
|   LogisticRegression  |   0.78   |    0.78   |  0.78  | 0.78 |
+-----------------------+----------+-----------+--------+------+


#Finally,
Save a copy in your Github. Remember renaming the notebook.