In [None]:
# Import Dependencies
import numpy as np
import pandas as pd
import sklearn
import os 

In [44]:
data_dir = 'data_reviews'
x_train_df = pd.read_csv(os.path.join(data_dir, 'x_train.csv'))
y_train_df = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))

tr_text_list = x_train_df['text'].values.tolist()

# Flatten y_train to (n,) in order to supress scikit-learn warnings
y_train = y_train_df.values.ravel()
print(y_train.shape)

(2400,)


<font size="10">Preprocessing steps</font>

In [2]:
# !pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hCollecting click (from nltk)
  Obtaining dependency information for click from https://files.pythonhosted.org/packages/00/2e/d53fa4befbf2cfa713304affc7ca780ce4fc1fd8710527771b58311a3229/click-8.1.7-py3-none-any.whl.metadata
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting regex>=2021.8.3 (from nltk)
  Obtaining dependency information for regex>=2021.8.3 from https://files.pythonhosted.org/packages/8f/3e/4b8b40eb3c80aeaf360f0361d956d129bb3d23b2a3ecbe3a04a8f3bdd6d3/regex-2023.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading regex-2023.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m2.8 MB/s[0m eta [36m

In [39]:
# Import Dependencies
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/joseph/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/joseph/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [40]:
def preprocess_basic(sentences):
    ''' Basic Preprocessing Tokenizer that only removes punctuations and uppercases'''
    preprocessed_sentences = []

    for sentence in sentences:
        # Tokenize the sentence
        tokens = word_tokenize(sentence)

        # Remove punctuations
        filtered_tokens = [word.lower() for word in tokens if word.isalpha()]

        # Join the tokens back into a single string
        preprocessed_sentence = ' '.join(filtered_tokens)
        preprocessed_sentences.append(preprocessed_sentence)

    return preprocessed_sentences

In [42]:
def preprocess_stopwords(sentences):
    '''Basic Preprocessing Tokenizer that aslo removes stopwords'''
    preprocessed_sentences = []
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    for sentence in sentences:
        # Tokenize the sentence
        tokens = word_tokenize(sentence)

        # Remove stopwords and punctuation
        filtered_tokens = [word for word in tokens if word.lower() not in stop_words and word.isalpha()]
        
        # Stemming (optional)
        # stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

        # Join the tokens back into a single string
        preprocessed_sentence = ' '.join(filtered_tokens)
        preprocessed_sentences.append(preprocessed_sentence)

    return preprocessed_sentences

<font size='10'>(Basic Preprocessing) Evaluate # Folds for CV</font>

In [55]:
# Import Dependencies
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [52]:
processed_reviews = preprocess_basic(tr_text_list)

In [54]:
# Create a Vectorizer Object
vectorizer = CountVectorizer()
 
vectorizer.fit(processed_reviews)
 
# Print the identified Unique words along with their indices
print("Vocabulary: ", vectorizer.vocabulary_)

# Encode the processed reviews
x_train_processed = vectorizer.transform(processed_reviews)
 
# Summarizing the Encoded Texts
print("Encoded Document is:")
print(x_train_processed.toarray())

Encoded Document is:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [56]:
# Create Logistic Regression model
model = LogisticRegression()

# Define scoring function based on scikitlearn cross_val_score
def kfold_scores(model, x_tr, y_tr, cv=3, scoring='accuracy'):
    scores = cross_val_score(model, x_tr, y_tr, cv=cv, scoring=scoring)

    # Print the accuracy scores for each fold
    print(f'For Folds = {cv}')
    for i, score in enumerate(scores):
        print(f'Fold {i+1} - ROC_AUC score: {score:.2f}')
        
    # Calculate and print the mean accuracy and standard deviation
    mean_accuracy = scores.mean()
    std_accuracy = scores.std()
    print(f'Mean ROC_AUC score: {mean_accuracy:.2f} \nStandard Deviation: {std_accuracy:.2f}')

In [48]:
scores_k03 = kfold_scores(model, x_train_processed, y_train, cv=3, scoring='roc_auc')

For Folds = 3
Fold 1 - ROC_AUC score: 0.84
Fold 2 - ROC_AUC score: 0.76
Fold 3 - ROC_AUC score: 0.83
Mean ROC_AUC score: 0.81
Standard Deviation: 0.04



In [49]:
scores_k05 = kfold_scores(model, x_train_processed, y_train, cv=5, scoring='roc_auc')

For Folds = 5
Fold 1 - ROC_AUC score: 0.90
Fold 2 - ROC_AUC score: 0.88
Fold 3 - ROC_AUC score: 0.83
Fold 4 - ROC_AUC score: 0.84
Fold 5 - ROC_AUC score: 0.89
Mean ROC_AUC score: 0.87
Standard Deviation: 0.03



In [50]:
scores_k10 = kfold_scores(model, x_train_processed, y_train, cv=10, scoring='roc_auc')

For Folds = 10
Fold 1 - ROC_AUC score: 0.90
Fold 2 - ROC_AUC score: 0.93
Fold 3 - ROC_AUC score: 0.86
Fold 4 - ROC_AUC score: 0.91
Fold 5 - ROC_AUC score: 0.83
Fold 6 - ROC_AUC score: 0.85
Fold 7 - ROC_AUC score: 0.83
Fold 8 - ROC_AUC score: 0.88
Fold 9 - ROC_AUC score: 0.89
Fold 10 - ROC_AUC score: 0.91
Mean ROC_AUC score: 0.88
Standard Deviation: 0.03



In [51]:
scores_k15 = kfold_scores(model, x_train_processed, y_train, cv=15, scoring='roc_auc')

For Folds = 15
Fold 1 - ROC_AUC score: 0.89
Fold 2 - ROC_AUC score: 0.93
Fold 3 - ROC_AUC score: 0.92
Fold 4 - ROC_AUC score: 0.86
Fold 5 - ROC_AUC score: 0.91
Fold 6 - ROC_AUC score: 0.88
Fold 7 - ROC_AUC score: 0.83
Fold 8 - ROC_AUC score: 0.84
Fold 9 - ROC_AUC score: 0.86
Fold 10 - ROC_AUC score: 0.80
Fold 11 - ROC_AUC score: 0.89
Fold 12 - ROC_AUC score: 0.88
Fold 13 - ROC_AUC score: 0.90
Fold 14 - ROC_AUC score: 0.93
Fold 15 - ROC_AUC score: 0.90
Mean ROC_AUC score: 0.88
Standard Deviation: 0.04



k = 10 seems sufficient (highest score, lowest std dev.)