# NLP_Assignment2_VarunVaddi_2347481

In [1]:
# Importing all the required libraries
import os
import pandas as pd
import spacy
import os
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.model_selection import KFold
print("Current working directory:", os.getcwd())

Current working directory: /Users/varunvaddi/Desktop/NLP/Assignment2_TextClassification


In [2]:
# Load the dataset from tsv file
data = pd.read_table('moviereviews.tsv')

# Display the first few rows
data.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [3]:
data.shape

(2000, 2)

#### From the above output, we can observe that the data contains 2000 rows and 2 columns

## Data Cleanup & 'label' column Mapping

In [4]:
# Remove rows with missing reviews
data = data.dropna(subset=['review'])

# Convert labels to numerical values (0 for negative, 1 for positive)
data['label'] = data['label'].map({'neg': 0, 'pos': 1})
data.shape

(1965, 2)

#### From the above output, we can observe that the no of rows reduced from 2000 to 1965, i.e., 35 rows are dropped as part of cleaning process

In [5]:
# Displaying the data after encoding label column values to 0 & 1
data.head()

Unnamed: 0,label,review
0,0,how do films like mouse hunt get into theatres...
1,0,some talented actresses are blessed with a dem...
2,1,this has been an extraordinary year for austra...
3,1,according to hollywood movies made in last few...
4,0,my first press screening of 1998 and already i...


## Data Preprocessing function

In [6]:
# Load the SpaCy Large model
nlp = spacy.load('en_core_web_lg')

def preprocess_text(text, lemmatize_words, remove_stop_words, handle_logical_negation):
    # Process text with spaCy
    doc = nlp(text)
    
    # Token processing
    tokens = []
    for token in doc:
        if token.is_punct or token.is_space:
            continue
        if remove_stop_words and token.is_stop:
            continue
        if lemmatize_words:
            token = token.lemma_
        else:
            token = token.text
        tokens.append(token)
    
    # Join tokens into a single string
    processed_text = ' '.join(tokens)

    if remove_stop_words:
        # Remove stop words - from, of, in, he/she
        stop_words = set(stopwords.words('english'))
        tokens = word_tokenize(text)
        processed_text = ' '.join(token for token in tokens if token.lower() not in stop_words)

    if lemmatize_words:
        # lemmatize
        processed_text = ' '.join(token.lemma_ for token in doc)
    
    if handle_logical_negation and lemmatize_words:
        # Replace "not" and "n't" with a placeholder for negation
        processed_text = re.sub(r'\bnot\b|\b(?!\bnot\b)\w+\'?nt\b', 'NEG', processed_text)
    
    return processed_text


## Function to Train and evaluate the model

In [7]:
def train_and_evaluate(X_train, y_train, X_test, y_test):
    # Vectorize the text data
    vectorizer = CountVectorizer()
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)
    
    # Train Na√Øve Bayes classifier
    model = MultinomialNB()
    model.fit(X_train_vectorized, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test_vectorized)
    
    # Print classification report
    print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))

## Function to run 4 different scenarios

In [8]:
def run_scenario(lemmatize_words, remove_stop_words, handle_logical_negation):
    # Preprocess reviews
    data['processed_review'] = data['review'].apply(
        lambda x: preprocess_text(x, lemmatize_words, remove_stop_words, handle_logical_negation)
    )
    
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        data['processed_review'], data['label'], test_size=0.2, random_state=42
    )
    
    # Train and evaluate the model
    train_and_evaluate(X_train, y_train, X_test, y_test)


## Calling the run_scenario function, by changing the values of parameters

In [9]:
# Scenario 1: No lemmatization with stop word removal, no logical negation
print("Scenario 1: No lemmatization, with stop word removal, no logical negation \n")
run_scenario(lemmatize_words=False, remove_stop_words=True, handle_logical_negation=False)
print("-------------------------------------------------------------------------------------------------------")

# Scenario 2: With lemmatization, no stop word removal, no logical negation
print("\nScenario 2: With lemmatization, no stop word removal, no logical negation \n")
run_scenario(lemmatize_words=True, remove_stop_words=False, handle_logical_negation=False)
print("-------------------------------------------------------------------------------------------------------")

# Scenario 3: With lemmatization, with stop word removal, no logical negation
print("\nScenario 3: With lemmatization, with stop word removal, no logical negation \n")
run_scenario(lemmatize_words=True, remove_stop_words=True, handle_logical_negation=False)
print("-------------------------------------------------------------------------------------------------------")

# Scenario 4: With lemmatization, with stop word removal, and handling logical negation
print("\nScenario 4: With lemmatization, with stop word removal, and handling logical negation \n")
run_scenario(lemmatize_words=True, remove_stop_words=True, handle_logical_negation=True)
print("-------------------------------------------------------------------------------------------------------")


Scenario 1: No lemmatization, with stop word removal, no logical negation 

              precision    recall  f1-score   support

    Negative       0.78      0.85      0.82       202
    Positive       0.83      0.75      0.79       191

    accuracy                           0.80       393
   macro avg       0.80      0.80      0.80       393
weighted avg       0.80      0.80      0.80       393

-------------------------------------------------------------------------------------------------------

Scenario 2: With lemmatization, no stop word removal, no logical negation 

              precision    recall  f1-score   support

    Negative       0.77      0.84      0.80       202
    Positive       0.81      0.74      0.77       191

    accuracy                           0.79       393
   macro avg       0.79      0.79      0.79       393
weighted avg       0.79      0.79      0.79       393

-----------------------------------------------------------------------------------------

### Summary of Results

1. **Scenario 1**: No lemmatization, with stop word removal, no logical negation
   - **Accuracy**: 80%


2. **Scenario 2**: With lemmatization, no stop word removal, no logical negation
   - **Accuracy**: 79%

3. **Scenario 3**: With lemmatization, with stop word removal, no logical negation
   - **Accuracy**: 80%

4. **Scenario 4**: With lemmatization, with stop word removal, and handling logical negation
   - **Accuracy**: 80%