<a href="https://colab.research.google.com/github/thutrieu123/MachineLearning/blob/main/Lab_8_20130422_TrieuAnhThu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This lab deals with **GridSearchCV** for tuning the hyper-parameters of an estimator and with applying vectorization techniques to the **movie reviews dataset** for a classification task.

*   **Deadline: 23:59, 17/4/2023**



# Import libraries

In [29]:
# code
from sklearn import datasets
from sklearn import svm
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from prettytable import PrettyTable
from sklearn.linear_model import LogisticRegression

# Task 1. With the **iris** dataset
*  1.1. Apply **GridSearchCV** for **SVM** to find the best hyperparameters using the following param_grid.

```
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}
```




In [12]:
# Load the iris dataset as pandas objects and separate features from labels.
iris = datasets.load_iris(as_frame=True)
X, y = iris['data'], iris['target']

In [15]:
# Candidate SVM hyperparameters, as given in the Task 1.1 statement.
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf', 'linear'],
)

In [16]:
# Keep the estimator under its own name so the imported `svm` module is not
# shadowed: later cells still call `svm.SVC(...)`, and this cell must stay
# re-runnable. `SVC` is imported directly at the top of the notebook.
# The kernel given here is only a starting value; the grid sets it per candidate.
svm_clf = SVC(kernel='linear', random_state=0)
grid_svm = GridSearchCV(
    estimator=svm_clf,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=4,
    cv=10,
    refit=True,
    return_train_score=True,
)

# 70/30 hold-out split with a fixed seed so the reported metrics are repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=1)

# Search on the training part only, then predict with the refitted best model.
grid_svm.fit(X_train, y_train)
y_predict = grid_svm.predict(X_test)

# Macro averaging weights the three iris classes equally.
aces_svm = metrics.accuracy_score(y_test, y_predict)
precision_svm = metrics.precision_score(y_test, y_predict, average='macro')
recall_svm = metrics.recall_score(y_test, y_predict, average='macro')
f1_svm = metrics.f1_score(y_test, y_predict, average='macro')

*  1.2. Apply **GridSearchCV** for **kNN** to find the best hyperparameters using the following param_grid.

```
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
```
where

    *  **n_neighbors**: Decide the best k based on the values we have computed earlier.
    *  **weights**: Check whether adding weights to the data points is beneficial to the model or not. 'uniform' assigns no weight, while 'distance' weighs points by the inverse of their distances meaning nearer points will have more weight than the farther points.
    *  **metric**: The distance metric to be used when calculating the similarity.


In [None]:
# Candidate kNN hyperparameters, as given in the Task 1.2 statement.
grid_params = dict(
    n_neighbors=[5, 7, 9, 11, 13, 15],
    weights=['uniform', 'distance'],
    metric=['minkowski', 'euclidean', 'manhattan'],
)

In [None]:
# Tune kNN with a 10-fold cross-validated grid search. The n_neighbors passed to
# the constructor is only a starting value; the grid supplies the tested values.
kNN = KNeighborsClassifier(n_neighbors=10)
grid_kNN = GridSearchCV(
    estimator=kNN,
    param_grid=grid_params,
    scoring='accuracy',
    cv=10,
    n_jobs=4,
    refit=True,
    return_train_score=True,
)

# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=1
)

# fit() returns the search object, so the refitted best model predicts right away.
y_predict = grid_kNN.fit(X_train, y_train).predict(X_test)

# Held-out metrics, macro-averaged over the classes.
aces_kNN, precision_kNN, recall_kNN, f1_kNN = (
    metrics.accuracy_score(y_test, y_predict),
    metrics.precision_score(y_test, y_predict, average='macro'),
    metrics.recall_score(y_test, y_predict, average='macro'),
    metrics.f1_score(y_test, y_predict, average='macro'),
)

*  1.3. Apply **GridSearchCV** for **Random Forest** to find the best hyperparameters using the following param_grid.

```
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}
```

In [None]:
# Candidate Random Forest hyperparameters, as given in the Task 1.3 statement.
param_grid = dict(
    n_estimators=[25, 50, 100, 150],
    max_features=['sqrt', 'log2', None],
    max_depth=[3, 6, 9],
    max_leaf_nodes=[3, 6, 9],
)

In [None]:
# Seed the forest: without random_state every run grows different trees, so the
# selected hyperparameters and the reported metrics would change between runs.
# n_estimators here is only a starting value; the grid supplies the tested values.
rf = RandomForestClassifier(n_estimators=100, random_state=0)

grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=4,
    cv=10,
    refit=True,
    return_train_score=True,
)

# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=1)

# Search on the training part only, then predict with the refitted best model.
grid_rf.fit(X_train, y_train)
y_predict = grid_rf.predict(X_test)

# Held-out metrics, macro-averaged over the classes.
aces_rf = metrics.accuracy_score(y_test, y_predict)
precision_rf = metrics.precision_score(y_test, y_predict, average='macro')
recall_rf = metrics.recall_score(y_test, y_predict, average='macro')
f1_rf = metrics.f1_score(y_test, y_predict, average='macro')

*   1.4. Compare the best obtained results from 1.1 to 1.3 (use PrettyTable to display the results)

In [17]:
# Side-by-side comparison of the tuned Task 1 models on the held-out iris split.
table = PrettyTable(["", "Accuracy", "Precision", "Recall", "F1"])
iris_results = [
    ["SVM ", aces_svm, precision_svm, recall_svm, f1_svm],
    ["kNN ", aces_kNN, precision_kNN, recall_kNN, f1_kNN],
    ["Random Forest ", aces_rf, precision_rf, recall_rf, f1_rf],
]
for model_name, *scores in iris_results:
    # Round at display time only; the stored metrics keep full precision.
    table.add_row([model_name] + [round(score, 4) for score in scores])

print(table)

+----------------+----------+-----------+--------+--------+
|                | Accuracy | Precision | Recall |   F1   |
+----------------+----------+-----------+--------+--------+
|      SVM       |   1.0    |    1.0    |  1.0   |  1.0   |
|      kNN       |  0.9556  |   0.9558  | 0.9558 | 0.9558 |
| Random Forest  |  0.9556  |   0.9558  | 0.9558 | 0.9558 |
+----------------+----------+-----------+--------+--------+


# Task 2.
For breast cancer dataset (https://tinyurl.com/3vme8hr3) which could be loaded from datasets in sklearn as follows:

```
#Import scikit-learn dataset library
from sklearn import datasets

#Load dataset
cancer = datasets.load_breast_cancer()
```

*   Apply **GridSearchCV** to different classification algorithms such as **SVM, kNN, LogisticRegression, RandomForest**.
*   Compare the results obtained by the best hyperparameters among classification algorithms.

In [30]:
# Load the breast cancer data as pandas objects and hold out 30% for testing.
cancer = datasets.load_breast_cancer(as_frame=True)
x_task2, y_task2 = cancer.data, cancer.target

X2_train, X2_test, Y2_train, Y2_test = train_test_split(
    x_task2,
    y_task2,
    train_size=0.7,
    test_size=0.3,
    random_state=1,
)

*   2.1. Apply **GridSearchCV** to **SVM** 


In [31]:
# SVM search space for the breast cancer data.
param_grid_svm = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'linear'],
}

In [None]:
# Use the directly imported SVC class rather than `svm.SVC`: the Task 1 cell
# assigns an estimator to the name `svm`, after which `svm.SVC` raises
# AttributeError. The kernel given here is only a starting value; the grid sets it.
clf_cancer = SVC(kernel='linear', random_state=0)

grid_svm_cancer_class = GridSearchCV(
    estimator=clf_cancer,
    param_grid=param_grid_svm,
    scoring='accuracy',
    n_jobs=4,
    cv=10,
    refit=True,
    return_train_score=True,
)

# Search on the training part only, then predict with the refitted best model.
grid_svm_cancer_class.fit(X2_train, Y2_train)

y_pred_svm_cancer = grid_svm_cancer_class.predict(X2_test)

# Held-out metrics, macro-averaged over both classes, rounded for the table.
aces_svm_cancer = round(metrics.accuracy_score(Y2_test, y_pred_svm_cancer), 4)
precision_svm_cancer = round(metrics.precision_score(Y2_test, y_pred_svm_cancer, average='macro'), 4)
recall_svm_cancer = round(metrics.recall_score(Y2_test, y_pred_svm_cancer, average='macro'), 4)
f1_svm_cancer = round(metrics.f1_score(Y2_test, y_pred_svm_cancer, average='macro'), 4)

*   2.2. Apply **GridSearchCV** to **kNN** 

In [23]:
# kNN search space for the breast cancer data.
grid_params = dict(
    n_neighbors=[5, 7, 9, 11, 13, 15],
    weights=['uniform', 'distance'],
    metric=['minkowski', 'euclidean', 'manhattan'],
)

In [None]:
# kNN on the breast cancer data, tuned with a 10-fold cross-validated grid search.
# The constructor's n_neighbors is only a starting value; the grid sets it.
kNN_cancer = KNeighborsClassifier(n_neighbors=10)

grid_kNN_cancer_class = GridSearchCV(
    estimator=kNN_cancer, param_grid=grid_params, scoring='accuracy',
    n_jobs=4, cv=10, refit=True, return_train_score=True,
)

# fit() returns the search object, so the refitted best model predicts right away.
y_pred_kNN_cancer = grid_kNN_cancer_class.fit(X2_train, Y2_train).predict(X2_test)

# Held-out metrics, macro-averaged over both classes, rounded for the table.
aces_kNN_cancer, precision_kNN_cancer, recall_kNN_cancer, f1_kNN_cancer = (
    round(metrics.accuracy_score(Y2_test, y_pred_kNN_cancer), 4),
    round(metrics.precision_score(Y2_test, y_pred_kNN_cancer, average='macro'), 4),
    round(metrics.recall_score(Y2_test, y_pred_kNN_cancer, average='macro'), 4),
    round(metrics.f1_score(Y2_test, y_pred_kNN_cancer, average='macro'), 4),
)

*   2.3. Apply **GridSearchCV** to **LogisticRegression** 

In [None]:
# Inverse regularization strengths to try for logistic regression.
param_grid_log = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
}

In [None]:
# Raise max_iter well above the library default: on these unscaled features the
# solver otherwise stops before converging (ConvergenceWarning), so the C values
# would be compared on unfinished fits.
classifier_cancer = LogisticRegression(random_state=0, max_iter=10000)

grid_cls_cancer_class = GridSearchCV(
    estimator=classifier_cancer,
    param_grid=param_grid_log,
    scoring='accuracy',
    n_jobs=4,
    cv=10,
    refit=True,
    return_train_score=True,
)

# Search on the training part only, then predict with the refitted best model.
grid_cls_cancer_class.fit(X2_train, Y2_train)

y_pred_cls_cancer = grid_cls_cancer_class.predict(X2_test)

# Held-out metrics, macro-averaged over both classes, rounded for the table.
aces_cls_cancer = round(metrics.accuracy_score(Y2_test, y_pred_cls_cancer), 4)
precision_cls_cancer = round(metrics.precision_score(Y2_test, y_pred_cls_cancer, average='macro'), 4)
recall_cls_cancer = round(metrics.recall_score(Y2_test, y_pred_cls_cancer, average='macro'), 4)
f1_cls_cancer = round(metrics.f1_score(Y2_test, y_pred_cls_cancer, average='macro'), 4)

*   2.4. Apply **GridSearchCV** to **RandomForest** 

In [None]:
# Random Forest search space for the breast cancer data.
param_grid_3 = dict(
    n_estimators=[25, 50, 100, 150],
    max_features=['sqrt', 'log2', None],
    max_depth=[3, 6, 9],
    max_leaf_nodes=[3, 6, 9],
)

In [None]:
# Seed the forest: without random_state every run grows different trees, so the
# selected hyperparameters and the reported metrics would change between runs.
# n_estimators here is only a starting value; the grid supplies the tested values.
rdf_cancer = RandomForestClassifier(n_estimators=100, random_state=0)

# 10-fold cross-validated grid search over the Random Forest search space.
grid_rdf_cancer_class = GridSearchCV(
    estimator=rdf_cancer,
    param_grid=param_grid_3,
    scoring='accuracy',
    n_jobs=4,
    cv=10,
    refit=True,
    return_train_score=True,
)

# Fit the search on the Random Forest training data.
grid_rdf_cancer_class.fit(X2_train, Y2_train)

# Predict the held-out set with the refitted best forest.
y_pred_rdf_cancer = grid_rdf_cancer_class.predict(X2_test)

# Held-out metrics, macro-averaged over both classes, rounded for the table.
aces_rdf_cancer = round(metrics.accuracy_score(Y2_test, y_pred_rdf_cancer), 4)
precision_rdf_cancer = round(metrics.precision_score(Y2_test, y_pred_rdf_cancer, average='macro'), 4)
recall_rdf_cancer = round(metrics.recall_score(Y2_test, y_pred_rdf_cancer, average='macro'), 4)
f1_rdf_cancer = round(metrics.f1_score(Y2_test, y_pred_rdf_cancer, average='macro'), 4)

*   2.5. Compare the best obtained results among classification algorithms (use PrettyTable to display the results) 

In [None]:
# Side-by-side comparison of the tuned Task 2 models (metrics are already rounded).
table_2 = PrettyTable(["", "Accuracy", "Precision", "Recall", "F1"])
cancer_rows = (
    ["Suport Vector Machine ", aces_svm_cancer, precision_svm_cancer, recall_svm_cancer, f1_svm_cancer],
    ["KNN ", aces_kNN_cancer, precision_kNN_cancer, recall_kNN_cancer, f1_kNN_cancer],
    ["Logistic Regression ", aces_cls_cancer, precision_cls_cancer, recall_cls_cancer, f1_cls_cancer],
    ["Random Forest ", aces_rdf_cancer, precision_rdf_cancer, recall_rdf_cancer, f1_rdf_cancer],
)
for row in cancer_rows:
    table_2.add_row(row)
print(table_2)

# Task 3.
The dataset consists of **2000 user-created movie reviews** archived on the IMDb(Internet Movie Database). The reviews are equally partitioned into a positive set and a negative set (1000+1000). Each review consists of a plain text file (.txt) and a class label representing the overall user opinion. 
The class attribute has only two values: **pos** (positive) or **neg** (negative).


*   3.1 Importing additional libraries

In [None]:
# Extra dependencies for the movie-review task. pandas, numpy and
# train_test_split are also imported in the first code cell of the notebook.
import nltk, random
nltk.download('movie_reviews')  # fetch the NLTK movie_reviews corpus
from nltk.corpus import movie_reviews
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_val_score
from collections import Counter
from sklearn.model_selection import train_test_split

*   3.2. Movie reviews information

In [None]:
# Quick look at the corpus: number of files, class labels, first words, first file ids.
review_ids = movie_reviews.fileids()
print(len(review_ids))
print(movie_reviews.categories())
print(movie_reviews.words()[:100])
print(review_ids[:10])

*   3.3. Create dataset from movie reviews

In [None]:
# Build one (token list, label) pair per review, category by category.
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))

# Shuffle with a fixed seed so the document order is repeatable.
random.seed(123)
random.shuffle(documents)

In [None]:
# Corpus summary: document count, total token count, and a short text sample.
doc_count = len(documents)
corpus_size = np.sum([len(tokens) for tokens, _ in documents])
first_doc_tokens = documents[0][0]

print('Number of Reviews/Documents: {}'.format(doc_count))
print('Corpus Size (words): {}'.format(corpus_size))
print('Sample Text of Doc 1:')
print('-'*30)
print(' '.join(first_doc_tokens[:50]))  # first 50 words of the first document

In [None]:
# Count how many reviews carry each sentiment label.
sentiment_distr = Counter(label for _, label in documents)
print(sentiment_distr)

*   3.4. Train test split

In [None]:
# Hold out 33% of the shuffled reviews for testing; the seed keeps the split repeatable.
train, test = train_test_split(
    documents,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Sentiment distribution for the train split, then the test split.
for split in (train, test):
    print(Counter(label for _, label in split))

In [None]:
# Rebuild each review as one whitespace-joined string (vectorizer input) and
# pull the labels out into parallel lists.
X_train = [' '.join(tokens) for tokens, _ in train]
y_train = [sentiment for _, sentiment in train]
X_test = [' '.join(tokens) for tokens, _ in test]
y_test = [sentiment for _, sentiment in test]

*   3.5. Text Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# TF-IDF features over purely alphabetic tokens, with min_df=10 as the
# document-frequency cut-off for rare terms.
tfidf_vec = TfidfVectorizer(
    token_pattern=r'[a-zA-Z]+',
    min_df=10,
)
# Learn the vocabulary and weights on the training text only, then reuse them
# unchanged for the test text.
X_train_bow = tfidf_vec.fit_transform(X_train)
X_test_bow = tfidf_vec.transform(X_test)

*   3.6. Apply **SVM** with **GridSearchCV** 

In [None]:
# SVM search space for the movie-review features.
param_grid_svm = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'linear'],
}

In [None]:
# Use the directly imported SVC class rather than `svm.SVC`: the Task 1 cell
# assigns an estimator to the name `svm`, after which `svm.SVC` raises
# AttributeError. The kernel given here is only a starting value; the grid sets it.
clf_movie = SVC(kernel='linear', random_state=0)

# 10-fold cross-validated grid search over the SVM search space.
grid_svm_movie_class = GridSearchCV(
    estimator=clf_movie,
    param_grid=param_grid_svm,
    scoring='accuracy',
    n_jobs=4,
    cv=10,
    refit=True,
    return_train_score=True,
)

# Fit the search on the vectorized training reviews.
grid_svm_movie_class.fit(X_train_bow, y_train)

# Predict the held-out reviews with the refitted best SVM.
y_pred_svm_movie = grid_svm_movie_class.predict(X_test_bow)

# Held-out metrics, macro-averaged over pos/neg, rounded for the table.
aces_svm_movie = round(metrics.accuracy_score(y_test, y_pred_svm_movie), 4)
precision_svm_movie = round(metrics.precision_score(y_test, y_pred_svm_movie, average='macro'), 4)
recall_svm_movie = round(metrics.recall_score(y_test, y_pred_svm_movie, average='macro'), 4)
f1_svm_movie = round(metrics.f1_score(y_test, y_pred_svm_movie, average='macro'), 4)

*   3.7. Apply **RandomForest** with **GridSearchCV** 

In [None]:
# Random Forest search space for the movie-review features.
param_grid_3 = dict(
    n_estimators=[25, 50, 100, 150],
    max_features=['sqrt', 'log2', None],
    max_depth=[3, 6, 9],
    max_leaf_nodes=[3, 6, 9],
)

In [None]:
# Seed the forest so the selected hyperparameters and metrics are repeatable.
# n_estimators here is only a starting value; the grid supplies the tested values.
rdf_movie = RandomForestClassifier(n_estimators=100, random_state=0)

# 10-fold cross-validated grid search over the Random Forest search space.
grid_rdf_movie_class = GridSearchCV(
    estimator=rdf_movie,
    param_grid=param_grid_3,
    scoring='accuracy',
    n_jobs=4,
    cv=10,
    refit=True,
    return_train_score=True,
)

# Fit on the movie-review labels. The lowercase y_train / y_test are the names
# defined by the train/test split cell; `Y_train` / `Y_test` do not exist.
grid_rdf_movie_class.fit(X_train_bow, y_train)

# Predict the held-out reviews with the refitted best forest.
y_pred_rdf_movie = grid_rdf_movie_class.predict(X_test_bow)

# Held-out metrics, macro-averaged over pos/neg, rounded for the table.
aces_rdf_movie = round(metrics.accuracy_score(y_test, y_pred_rdf_movie), 4)
precision_rdf_movie = round(metrics.precision_score(y_test, y_pred_rdf_movie, average='macro'), 4)
recall_rdf_movie = round(metrics.recall_score(y_test, y_pred_rdf_movie, average='macro'), 4)
f1_rdf_movie = round(metrics.f1_score(y_test, y_pred_rdf_movie, average='macro'), 4)

*   3.8. Apply **kNN** with **GridSearchCV** 

In [None]:
# kNN search space for the movie-review features.
grid_params = dict(
    n_neighbors=[5, 7, 9, 11, 13, 15],
    weights=['uniform', 'distance'],
    metric=['minkowski', 'euclidean', 'manhattan'],
)

In [None]:
# kNN on the TF-IDF features, tuned with a 10-fold cross-validated grid search.
# The constructor's n_neighbors is only a starting value; the grid sets it.
kNN_movie = KNeighborsClassifier(n_neighbors=10)

grid_kNN_movie_class = GridSearchCV(
    estimator=kNN_movie, param_grid=grid_params, scoring='accuracy',
    n_jobs=4, cv=10, refit=True, return_train_score=True,
)

# fit() returns the search object, so the refitted best model predicts right away.
y_pred_kNN_movie = grid_kNN_movie_class.fit(X_train_bow, y_train).predict(X_test_bow)

# Held-out metrics, macro-averaged over pos/neg, rounded for the table.
aces_kNN_movie, precision_kNN_movie, recall_kNN_movie, f1_kNN_movie = (
    round(metrics.accuracy_score(y_test, y_pred_kNN_movie), 4),
    round(metrics.precision_score(y_test, y_pred_kNN_movie, average='macro'), 4),
    round(metrics.recall_score(y_test, y_pred_kNN_movie, average='macro'), 4),
    round(metrics.f1_score(y_test, y_pred_kNN_movie, average='macro'), 4),
)

*   3.9. Apply **LogisticRegression** with **GridSearchCV** 

In [None]:
# Inverse regularization strengths to try for logistic regression.
param_grid_log = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
}

In [None]:
classifier_movie = LogisticRegression(random_state=0)

# 10-fold cross-validated grid search over the regularization strengths.
grid_cls_movie_class = GridSearchCV(
    estimator=classifier_movie,
    param_grid=param_grid_log,
    scoring='accuracy',
    n_jobs=4,
    cv=10,
    refit=True,
    return_train_score=True,
)

# Fit on the movie-review labels (y_train). Y2_train holds the breast cancer
# labels from Task 2, which belong to a different dataset than X_train_bow.
grid_cls_movie_class.fit(X_train_bow, y_train)

# Predict the held-out reviews with the refitted best model.
y_pred_cls_movie = grid_cls_movie_class.predict(X_test_bow)

# Score every metric against the same y_test; `Y_test` and `Yasaa_test` are
# not defined anywhere in the notebook.
aces_cls_movie = round(metrics.accuracy_score(y_test, y_pred_cls_movie), 4)
precision_cls_movie = round(metrics.precision_score(y_test, y_pred_cls_movie, average='macro'), 4)
recall_cls_movie = round(metrics.recall_score(y_test, y_pred_cls_movie, average='macro'), 4)
f1_cls_movie = round(metrics.f1_score(y_test, y_pred_cls_movie, average='macro'), 4)

*   3.10. Compare the best obtained results among classification algorithms (use PrettyTable to display the results) 

In [None]:
# Side-by-side comparison of the tuned Task 3 models (metrics are already rounded).
table_3 = PrettyTable(["", "Accuracy", "Precision", "Recall", "F1"])
movie_rows = (
    ["Suport Vector Machine ", aces_svm_movie, precision_svm_movie, recall_svm_movie, f1_svm_movie],
    ["KNN ", aces_kNN_movie, precision_kNN_movie, recall_kNN_movie, f1_kNN_movie],
    ["Logistic Regression ", aces_cls_movie, precision_cls_movie, recall_cls_movie, f1_cls_movie],
    ["Random Forest ", aces_rdf_movie, precision_rdf_movie, recall_rdf_movie, f1_rdf_movie],
)
for row in movie_rows:
    table_3.add_row(row)
print(table_3)

# Finally,
Save a copy in your GitHub. Remember to rename the notebook.