In [44]:
#import nltk
#nltk.download('stopwords')
#nltk.download('punkt_tab')

In [39]:
import pandas as pd
import spacy
import os
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report

In [13]:
print("Current working directory:", os.getcwd())

Current working directory: /Users/varunvaddi/Desktop/NLP/Assignment2_TextClassification


In [46]:
# Read the TSV file into a DataFrame
df_raw = pd.read_table('moviereviews.tsv')

# Display the first few rows of the DataFrame
df_raw.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [4]:
df_raw.shape

(2000, 2)

In [5]:
df_raw.describe()

Unnamed: 0,label,review
count,2000,1965.0
unique,2,1939.0
top,neg,
freq,1000,27.0


### Data Cleanup

In [6]:
df = df_raw.dropna()

In [7]:
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [8]:
df.shape

(1965, 2)

#### 35 rows dropped for having null values

In [9]:
df.describe()

Unnamed: 0,label,review
count,1965,1965.0
unique,2,1939.0
top,neg,
freq,983,27.0


In [10]:
# Map 'negative' to 0 and 'positive' to 1
df['enco_label'] = df['label'].map({'neg': 0, 'pos': 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['enco_label'] = df['label'].map({'neg': 0, 'pos': 1})


In [11]:
df.head()

Unnamed: 0,label,review,enco_label
0,neg,how do films like mouse hunt get into theatres...,0
1,neg,some talented actresses are blessed with a dem...,0
2,pos,this has been an extraordinary year for austra...,1
3,pos,according to hollywood movies made in last few...,1
4,neg,my first press screening of 1998 and already i...,0


### Data Preprocessing

In [15]:
# Load the English model for lemmatization
nlp = spacy.load('en_core_web_lg')

In [27]:
def preprocess(text, lemmatize_words, remove_stop_words, handle_logical_negation):
    # Remove symbols and punctuations
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    if lemmatize_words:
        # Tokenize and lemmatize
        doc = nlp(text)
        text = ' '.join(token.lemma_ for token in doc)
    
    if remove_stop_words:
        # Remove stop words - from, of, in, he/she
        stop_words = set(stopwords.words('english'))
        tokens = word_tokenize(text)
        text = ' '.join(token for token in tokens if token.lower() not in stop_words)
    
    if handle_logical_negation and lemmatize_words:
        # Handle logical negation by replacing 'not' with 'not_' 
        text = re.sub(r'\bnot\b', 'not_', text)
    
    return text

In [30]:
df_subset = df[5:15]

In [32]:
df_subset.describe()

Unnamed: 0,enco_label
count,10.0
mean,0.1
std,0.316228
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [33]:
# Apply the preprocessing function to the 'review' column
df_subset['process_FTF'] = df['review'].apply(preprocess, lemmatize_words=False, remove_stop_words=True, handle_logical_negation=False)
df_subset['process_TFF'] = df['review'].apply(preprocess, lemmatize_words=True, remove_stop_words=False, handle_logical_negation=False)
df_subset['process_TTF'] = df['review'].apply(preprocess, lemmatize_words=True, remove_stop_words=True, handle_logical_negation=False)
df_subset['process_TTT'] = df['review'].apply(preprocess, lemmatize_words=True, remove_stop_words=True, handle_logical_negation=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['process_FTF'] = df['review'].apply(preprocess, lemmatize_words=False, remove_stop_words=True, handle_logical_negation=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['process_TFF'] = df['review'].apply(preprocess, lemmatize_words=True, remove_stop_words=False, handle_logical_negation=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-doc

In [34]:
# Display the DataFrame to check the changes
print(df_subset)

   label                                             review  enco_label  \
5    neg  to put it bluntly , ed wood would have been pr...           0   
6    neg  synopsis : melissa , a mentally-disturbed woma...           0   
7    neg  tim robbins and martin lawernce team up in thi...           0   
8    neg  in " gia " , angelina jolie plays the titular ...           0   
9    neg  in 1990 , the surprise success an unheralded l...           0   
10   neg  upon first viewing of this movie , the phrases...           0   
11   pos  with stars like sigourney weaver ( " alien " t...           1   
12   neg   " the 13th warrior " comes at the end of as s...           0   
13   neg  georges polti once wrote a paper called " the ...           0   
14   neg  actually , i'm fairly sure the experience of h...           0   

                                          process_FTF  \
5   put bluntly ed wood would proud totally ridicu...   
6   synopsis melissa mentallydisturbed woman likes...   
7  

# Splitting the dataset

In [37]:
# Define features and target
X = df['process_FTF']
y = df['label']  # assuming the label column contains the classes (e.g., 'positive' or 'negative')

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Naive Bayes Classifier

In [40]:
# Create a CountVectorizer for feature extraction
vectorizer = CountVectorizer()

# Transform the training and testing data
X_train_matrix = vectorizer.fit_transform(X_train)
X_test_matrix = vectorizer.transform(X_test)

In [41]:
# Train a Naïve Bayes classifier
model = MultinomialNB(alpha=1.0)  # Laplace smoothing (add 1)
model.fit(X_train_matrix, y_train)

# Testing

In [42]:
# Predict on the test set
y_pred = model.predict(X_test_matrix)

In [43]:
# Evaluate the classifier
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[170  32]
 [ 47 144]]

Classification Report:
              precision    recall  f1-score   support

         neg       0.78      0.84      0.81       202
         pos       0.82      0.75      0.78       191

    accuracy                           0.80       393
   macro avg       0.80      0.80      0.80       393
weighted avg       0.80      0.80      0.80       393

