In [1]:
import os
from collections import defaultdict

import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn import model_selection, naive_bayes, svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [2]:
# Fix the seed of NumPy's global random number generator so random steps repeat across runs
RANDOM_SEED = 500
np.random.seed(RANDOM_SEED)

In [3]:
# Load the corpus from a parquet file using pandas.
# The location can be overridden with the CORPUS_PARQUET_PATH environment variable so the
# notebook also runs on machines that do not have this exact folder layout; when the
# variable is not set, the original local path is used.
DATA_PATH = os.environ.get(
    'CORPUS_PARQUET_PATH',
    '/Users/xiomara/Desktop/BSAN 6070/Teaching Workshop/test-00000-of-00001.parquet',
)
Corpus = pd.read_parquet(DATA_PATH)

In [4]:
# Step - 1: Data pre-processing - cleaning the text helps the classification algorithms give better results

# Step - 1a : Remove rows whose text is missing.
# Calling dropna(inplace=True) on the Corpus['text'] column does not remove any row from
# Corpus itself, so the frame is filtered and rebound instead. The .copy() gives an
# independent frame that the column assignments below can safely write into.
Corpus = Corpus.dropna(subset=['text']).copy()

# Step - 1b : Change all the text to lower case, so that 'dog' and 'DOG' are treated as the same word
Corpus['text'] = Corpus['text'].map(str.lower)

# Step - 1c : Tokenization : each entry in the corpus is broken into a list of words
Corpus['text'] = Corpus['text'].map(word_tokenize)

In [5]:
# Step - 1d : Remove stop words and non-alphabetic tokens, and perform word lemmatization.
# WordNetLemmatizer needs a part of speech to know whether a word is a noun, verb, adjective, etc.
# tag_map translates the first letter of a POS tag into a WordNet part of speech:
# 'J' -> adjective, 'V' -> verb, 'R' -> adverb; every other letter falls back to noun.
tag_map = defaultdict(
    lambda: wn.NOUN,
    {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV},
)

In [6]:
# English stop words, kept in a set so membership checks are fast
stop_words = set(stopwords.words('english'))

# One lemmatizer instance, created once here and reused for every word
word_Lemmatized = WordNetLemmatizer()

# Clean and lemmatize one tokenized document
def process_text(entry):
    """Return the cleaned, lemmatized form of one tokenized document as a single string.

    entry is passed to pos_tag, which yields (word, tag) pairs. A word is kept only
    when it is purely alphabetic and not in stop_words; each kept word is lemmatized
    with the WordNet part of speech that tag_map gives for the first letter of its tag.
    The kept lemmas are joined with single spaces.
    """
    lemmas = [
        word_Lemmatized.lemmatize(token, tag_map[pos[0]])
        for token, pos in pos_tag(entry)
        if token.isalpha() and token not in stop_words
    ]
    return ' '.join(lemmas)
    
# Build the cleaned text column by running process_text over every tokenized entry
Corpus['text_final'] = Corpus['text'].map(process_text)

# Show the first rows of the frame, now including the processed text
print(Corpus.head())

                                                text  label  \
0  [i, love, sci-fi, and, am, willing, to, put, u...      0   
1  [worth, the, entertainment, value, of, a, rent...      0   
2  [its, a, totally, average, film, with, a, few,...      0   
3  [star, rating, :, *, *, *, *, *, saturday, nig...      0   
4  [first, off, let, me, say, ,, if, you, have, n...      0   

                                          text_final  
0  love willing put lot usually underfunded misun...  
1  worth entertainment value rental especially li...  
2  totally average film action sequence make plot...  
3  star rating saturday night friday night friday...  
4  first let say enjoy van damme movie since bloo...  


In [7]:
# Step - 2: Split the data into train and test sets (30% of the rows go to the test set)
# random_state is passed explicitly so that re-running this cell on its own returns the same
# split, instead of depending on how far NumPy's global random generator has already advanced.
# NOTE(review): the value matches the notebook's global seed; re-run the notebook to confirm
# the recorded accuracy is unchanged.
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    Corpus['text_final'],
    Corpus['label'],
    test_size=0.3,
    random_state=500,
)


In [8]:
# Step - 3: Label encode the target variable - map each class label to an integer code
# The encoder is fitted on the training labels only and the same mapping is then applied to
# the test labels. Fitting a second time on the test labels could assign different codes
# whenever the two sets do not contain exactly the same classes.
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
# transform (not fit_transform): raises ValueError if the test set has a label unseen in training
Test_Y = Encoder.transform(Test_Y)

In [9]:
# Step - 4: Turn the text into TF-IDF feature vectors

def _as_text(doc):
    """Return doc as one string: a list of tokens is joined with spaces, anything else is returned unchanged."""
    return ' '.join(doc) if isinstance(doc, list) else doc


# Make sure every document in Train_X and Test_X is a plain string before vectorizing
Train_X = Train_X.apply(_as_text)
Test_X = Test_X.apply(_as_text)

# Create a TfidfVectorizer with a maximum of 5000 features
Tfidf_vect = TfidfVectorizer(max_features=5000)

# Fit the vectorizer on the training data only and transform it in the same call,
# rather than processing the training documents once to fit and a second time to transform
Train_X_Tfidf = Tfidf_vect.fit_transform(Train_X)

# Transform the test data with the vectorizer fitted on the training data
Test_X_Tfidf = Tfidf_vect.transform(Test_X)


In [10]:
# Step - 5: Now we can run different algorithms to classify our data and check their accuracy

# Classifier - Algorithm - Naive Bayes
# Fit the classifier on the TF-IDF features of the training set
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf, Train_Y)

# Predict the labels of the test set
predictions_NB = Naive.predict(Test_X_Tfidf)

# Use accuracy_score to get the accuracy as a percentage.
# Its documented argument order is (y_true, y_pred): true labels first, predictions second.
# The accuracy value does not depend on the order, but keeping it correct avoids wrong
# results if this call is later copied for a metric where the order matters.
print("Naive Bayes Accuracy Score -> ",accuracy_score(Test_Y, predictions_NB)*100)

Naive Bayes Accuracy Score ->  85.52
