<a href="https://colab.research.google.com/github/wooihaw/ERA3036_T2310/blob/main/Chapter_3/Chapter_3_Examples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Example 1
### Machine Learning Pipeline

In [None]:
# Importing necessary libraries for pipeline creation, model selection, and preprocessing
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split as split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

# Load the breast cancer dataset as a feature matrix X and a label vector y
X, y = load_breast_cancer(return_X_y=True)

# Hold out a test set; stratify=y keeps the class proportions equal in both splits
X_train, X_test, y_train, y_test = split(X, y, stratify=y, random_state=42)

# Baseline: k-nearest neighbours fitted on the raw features, no preprocessing
knn = KNeighborsClassifier().fit(X_train, y_train)
print(f"Without pipeline: {knn.score(X_test, y_test):.3%}")

# Pipeline skeleton: scaler -> feature selector -> classifier.
# The 'scl' and 'clf' steps are placeholders that the grid search below replaces.
pipe = Pipeline([('scl', None), ('fs', SelectKBest()), ('clf', KNeighborsClassifier())])

# Search space: every combination of scaler, number of selected features and classifier
params = {}
params['scl'] = [None, MinMaxScaler(), StandardScaler(), RobustScaler()]  # None = no scaling step
params['fs__k'] = range(10, 20)  # number of features kept by SelectKBest (10..19)
params['clf'] = [
    KNeighborsClassifier(),
    # Raised iteration cap: with the default cap the solver stops before converging
    # on the unscaled ('scl' = None) candidates and floods the output with warnings.
    LogisticRegression(max_iter=10000),
    # Seeded so that repeated runs of the search rank the candidates identically,
    # matching the seeded train/test split above.
    DecisionTreeClassifier(random_state=42),
]

# 5-fold cross-validated grid search over the pipeline, using all available cores
gs = GridSearchCV(pipe, params, cv=5, n_jobs=-1, verbose=2)
# Fit on the training split only, so the test set stays unseen until the final score
gs.fit(X_train, y_train)

# Best combination of scaler, feature count and classifier found by the search
print(gs.best_params_)

# Test-set score of the best pipeline found by the search
print(f"With pipeline: {gs.score(X_test, y_test):.3%}")

## Example 2
### Movie Review Sentiment Analysis

In [None]:
# Load movie reviews
import sys
import requests
from io import BytesIO
from zipfile import ZipFile
from sklearn.datasets import load_files


# Download the zipped movie review corpus. The timeout keeps the cell from
# hanging forever if the server never answers.
response = requests.get(
    "https://raw.githubusercontent.com/wooihaw/datasets/main/movie_reviews.zip",
    timeout=60,
)

# Stop right here on a failed download. Carrying on would only fail later with a
# NameError on `moviedir`, which hides the real cause.
if response.status_code != 200:
    raise RuntimeError(f"Failed to retrieve the file (HTTP {response.status_code})")

# Pick the target folder for the current environment:
# Google Colab unpacks into sample_data/, a local Jupyter session into ../data/
is_colab = 'google.colab' in sys.modules
if is_colab:
    extract_dir = "sample_data/"
    moviedir = 'sample_data/movie_reviews'
else:
    extract_dir = "../data/"
    moviedir = '../data/movie_reviews/'

# Unpack the archive straight from memory; the context manager closes it afterwards
with ZipFile(BytesIO(response.content)) as archive:
    archive.extractall(extract_dir)

# Read the extracted review files into a dataset object (shuffled)
movies = load_files(moviedir, shuffle=True)

In [None]:
# Target names ("classes") are generated automatically from the sub-folder names
# of the directory that was passed to load_files
print(movies.target_names)

In [None]:
# Hold out 20% of the reviews as a test set; the fixed seed makes the split repeatable.
# NOTE: this rebinds y_train / y_test, which Example 1 also defined.
from sklearn.model_selection import train_test_split

docs_train, docs_test, y_train, y_test = train_test_split(
    movies.data,
    movies.target,
    test_size=0.20,
    random_state=12,
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer over unigrams and bigrams (ngram_range=(1, 2));
# min_df=2 drops terms that occur in fewer than two documents
movieVzer = CountVectorizer(
    ngram_range=(1, 2),
    min_df=2,
)

# Fit and transform using the training text only
docs_train_counts = movieVzer.fit_transform(docs_train)

In [None]:
# Convert raw frequency counts into TF-IDF values
from sklearn.feature_extraction.text import TfidfTransformer
movieTfmer = TfidfTransformer(use_idf=True)
# Fit the transformer on the training counts and transform them in the same call
docs_train_tfidf = movieTfmer.fit_transform(docs_train_counts)

In [None]:
# Using the vectorizer and transformer already fitted on the training text,
# transform the test data (transform only - nothing is re-fitted here)
docs_test_counts = movieVzer.transform(docs_test)
docs_test_tfidf = movieTfmer.transform(docs_test_counts)

In [None]:
# Import the Multinomial Naive Bayes classifier from scikit-learn
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Train a Multinomial Naive Bayes classifier on the TF-IDF features of the
# training reviews and their labels
clf = MultinomialNB()
clf.fit(docs_train_tfidf, y_train)

In [None]:
# Score the classifier on the held-out test reviews to find its accuracy;
# the returned value is shown as the cell output
clf.score(docs_test_tfidf, y_test)

In [None]:
# A handful of short, hand-written reviews to try the trained model on
reviews_new = [
    'This movie was excellent',
    'Absolute joy ride',
    'Tom Hanks was amazing',
    'Tom Cruise shone through.',
    'This is a huge letdown',
    'Two thumbs up',
    'I fell asleep halfway through',
    "Can't wait for the sequel",
    'I cannot recommend this highly enough',
    'Instant classic.',
    'Tom Hanks performance was Oscar-worthy.',
    'A must-see event for all moviegoers',
    "Endgame isn't a great movie, but there are flashes of greatness in it",
]

# Push the new text through the same fitted steps as the training data:
# raw text -> count vector -> TF-IDF vector
reviews_new_counts = movieVzer.transform(reviews_new)
reviews_new_tfidf = movieTfmer.transform(reviews_new_counts)

In [None]:
# Have the trained classifier predict a class for each of the new reviews
pred = clf.predict(reviews_new_tfidf)

In [None]:
# Show every review next to the name of the class predicted for it
for review, category in zip(reviews_new, pred):
    label = movies.target_names[category]  # predicted value indexes into the class names
    print('%r => %s' % (review, label))