## Loading Libraries

In [20]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
from PIL import Image

## Loading Datasets

In [21]:

# Load the dataframe
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

## Cleaning The Dataset

In [22]:
# Fill missing values in 'TEXT' column with a placeholder
placeholder = "THE"  # Define your placeholder
train_data['TEXT'].fillna(placeholder, inplace=True)
test_data['TEXT'].fillna(placeholder, inplace=True)

# Function to clean and preprocess the text
def clean_text(text):
    # Placeholder example for cleaning, add your own process
    text = re.sub(r'[^a-zA-Z]', ' ', text.lower())
    text = re.sub(r'<[^>]*>', ' ', text)
    tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    cleaned_text = ' '.join([word for word in tokens if word not in stop_words])
    return cleaned_text

# Apply the cleaning function to the 'TEXT' column
train_data['CLEAN_TEXT'] = train_data['TEXT'].apply(clean_text)
test_data['CLEAN_TEXT'] = test_data['TEXT'].apply(clean_text)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['TEXT'].fillna(placeholder, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['TEXT'].fillna(placeholder, inplace=True)


## Creating WordClouds

In [28]:


def generate_wordcloud(text, mask_path, output_path):
    mask = np.array(Image.open(mask_path))
    stopwords_set = set(stopwords.words('english'))  # Convert stopwords to a set
    wordcloud = WordCloud(width=800, height=800, background_color='black', mask=mask, contour_width=3, contour_color='steelblue', stopwords=stopwords_set)
    wordcloud.generate(text)
    wordcloud.to_file(output_path)

# Generate word cloud for positive reviews
positive_reviews = ' '.join(train_data[train_data['LABEL'] == 1]['CLEAN_TEXT'])
generate_wordcloud(positive_reviews, 'images/up.png', 'word_clouds/positive_wordcloud.png')

# Generate word cloud for negative reviews
negative_reviews = ' '.join(train_data[train_data['LABEL'] == 2]['CLEAN_TEXT'])
generate_wordcloud(negative_reviews, 'images/down.png', 'word_clouds/negative_wordcloud.png')

# Generate word cloud for non-movie/TV reviews
non_movie_reviews = ' '.join(train_data[train_data['LABEL'] == 0]['CLEAN_TEXT'])
generate_wordcloud(non_movie_reviews, 'images/cat.png', 'word_clouds/non_movie_wordcloud.png')

## Training The Model

In [24]:
# Extract the 'TEXT' column as the corpus
corpus_train = train_data['TEXT'].tolist()
corpus_test = test_data['TEXT'].tolist()

# Extract the label information
raw_labels_train = train_data['LABEL'].tolist()

# Encode the labels
le = LabelEncoder()
y_train = le.fit_transform(raw_labels_train)

# Split the training data into training and validation sets
X_train, X_validation, y_train, y_validation = train_test_split(
    corpus_train, y_train, test_size=0.2, random_state=42)


# Feature extraction (character n-grams)
vectorizer = CountVectorizer(ngram_range=(1, 3), analyzer="word")
X_train = vectorizer.fit_transform(X_train)
X_validation = vectorizer.transform(X_validation)


# Logistic Regression with further tuning
clf = LogisticRegression(max_iter=10000, C=0.1, multi_class='multinomial', solver='lbfgs')

# Train and evaluate the classifier
clf.fit(X_train, y_train)




## Evaluate The Model

In [25]:
# Evaluate the classifier on the training data
train_predictions = clf.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)
train_precision = precision_score(y_train, train_predictions, average='weighted')
train_recall = recall_score(y_train, train_predictions, average='weighted')
train_f1 = f1_score(y_train, train_predictions, average='weighted')
print("Training Evaluation:")
print("Accuracy:", train_accuracy)
print("Precision:", train_precision)
print("Recall:", train_recall)
print("F1-score:", train_f1)
print()


Training Evaluation:
Accuracy: 0.999182265834711
Precision: 0.9991832701379051
Recall: 0.999182265834711
F1-score: 0.9991823036993919



## Evaluate The Model on The Validation Data


In [26]:
# Evaluate the classifier on the validation data
validation_predictions = clf.predict(X_validation)
validation_accuracy = accuracy_score(y_validation, validation_predictions)
validation_precision = precision_score(y_validation, validation_predictions, average='weighted')
validation_recall = recall_score(y_validation, validation_predictions, average='weighted')
validation_f1 = f1_score(y_validation, validation_predictions, average='weighted')
print("Validation Evaluation:")
print("Accuracy:", validation_accuracy)
print("Precision:", validation_precision)
print("Recall:", validation_recall)
print("F1-score:", validation_f1)
print()


Validation Evaluation:
Accuracy: 0.9262656427758816
Precision: 0.9254382084134495
Recall: 0.9262656427758816
F1-score: 0.9256446309154474



## Creating Submission File

In [27]:
# Make predictions on the test data
X_test = vectorizer.transform(corpus_test)
test_predictions = clf.predict(X_test)
test_predictions_labels = le.inverse_transform(test_predictions)

# Create a submission file
submission = pd.DataFrame({'ID': test_data['ID'], 'LABEL': test_predictions_labels})
submission = submission[['ID', 'LABEL']]  # Reorder the columns
submission.to_csv('submission.csv', index=False)