# Oppenheimer or Barbie?

## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import tensorflow as tf
import keras

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from keras.optimizers import Adam
from keras.layers import Dense, Input
from keras.optimizers import SGD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from keras.models import Sequential
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

## Data

In [None]:
# Read the two raw IMDb review CSVs (no header row, so the first line is
# data) and discard rows that are exact duplicates of an earlier row.
df_bb = pd.read_csv('./data/imdb_barbie_Uncleaned.csv', header=None).drop_duplicates()
df_op = pd.read_csv('./data/imdb_oppenhimmer_Uncleaned.csv', header=None).drop_duplicates()

## Data Cleaning

In [None]:
import re

# Regular expressions that pull the individual fields out of one raw review
# blob.  All patterns are raw strings so that sequences such as ``\d`` reach
# the regex engine untouched instead of relying on Python leaving unknown
# string escapes alone.

# Every month must be listed: a month missing from this alternation makes
# reviews written in that month lose both their date and their username.
_MONTH_Reg = (
    '(?:January|February|March|April|May|June|July|August|'
    'September|October|November|December)'
)

# "<score>/<max> <title line>" -> captures the score.
score_Reg = r'(\d{1,2})\/\d{1,2}\s.*\s'
# Rest of the line that follows "<score>/<max> " -> captures the title.
title_Reg = r'\d{1,2}\/\d{1,2}\s(.*)'
# "<n> out of <m> found this helpful" -> captures (n, m); digits may contain commas.
people_found_helpful_Reg = r'([\d,]*) out of ([\d,]*) found this helpful'
# The username sits directly in front of the review date.
# NOTE(review): ``\d{2}`` only accepts a two-digit day, so a username followed
# by a single-digit day is not captured -- confirm this is acceptable.
username_Reg = r'\d{1,2}\/\d{1,2}\s.*\s([A-Za-z0-9_-]+)\d{2}\s' + _MONTH_Reg
# "<day> <Month> <year>" -> captures the whole date string.
date_Reg = r'(\d{1,2}\s' + _MONTH_Reg + r'\s\d{4})'
# Everything between the date and the "found this helpful" footer.
review_Reg = r'\d{1,2}\s\w+\s\d{4}\s([\s\S]*)\s[\d,]* out of [\d,]* found this helpful'

In [None]:
# Clean every row in data set

def _first_group(pattern, text, default='NULL'):
    """Return the first capture of a one-group *pattern* in *text*, else *default*."""
    found = re.findall(pattern, text)
    return found[0] if found else default


def get_cleaned_df(dirty_df):
    """Parse raw review blobs into a structured DataFrame.

    Parameters
    ----------
    dirty_df : pandas.DataFrame
        Frame whose first column holds one raw review string per row.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title``, ``username``, ``date``,
        ``people_found_helpful``, ``total_people_viewed`` and ``review``.
        Rows in which no score can be found are skipped.  Any other field
        that cannot be parsed is stored as the string ``'NULL'``.  Each
        output row is labelled with the position of its source row in
        ``dirty_df``.
    """
    columns = ['score', 'title', 'username', 'date',
               'people_found_helpful', 'total_people_viewed', 'review']

    # A frame without columns has nothing to parse.
    if dirty_df.shape[1] == 0:
        return pd.DataFrame(columns=columns)

    # Collect rows in plain lists and build the frame once at the end;
    # growing a DataFrame one ``.loc`` row at a time copies it on every insert.
    records = []
    positions = []

    for position, raw in enumerate(dirty_df.iloc[:, 0]):
        # Empty CSV cells are read as NaN (a float); re.findall needs a string.
        if not isinstance(raw, str):
            continue

        score = re.findall(score_Reg, raw)
        if not score:
            continue

        helpful = re.findall(people_found_helpful_Reg, raw)
        if helpful:
            # Counts may contain thousands separators, e.g. "1,234".
            found_helpful, total_viewed = (
                int(number.replace(',', '')) for number in helpful[0]
            )
        else:
            found_helpful = total_viewed = 'NULL'

        positions.append(position)
        records.append([
            int(score[0]),
            _first_group(title_Reg, raw),
            _first_group(username_Reg, raw),
            _first_group(date_Reg, raw),
            found_helpful,
            total_viewed,
            _first_group(review_Reg, raw),
        ])

    return pd.DataFrame(records, columns=columns, index=positions)


In [None]:
# Parse the raw Barbie rows and keep only the first review per username.
# NOTE(review): rows whose username could not be parsed all share the
# placeholder 'NULL', so only the first of them survives -- confirm intended.
cleand_df_bb = get_cleaned_df(df_bb).drop_duplicates(subset=['username'])

In [None]:
# Parse the raw Oppenheimer rows and keep only the first review per username.
# NOTE(review): rows whose username could not be parsed all share the
# placeholder 'NULL', so only the first of them survives -- confirm intended.
cleand_df_op = get_cleaned_df(df_op).drop_duplicates(subset=['username'])

In [None]:
# Notebook display of the cleaned Barbie reviews.
cleand_df_bb

In [None]:
# Notebook display of the cleaned Oppenheimer reviews.
cleand_df_op

In [None]:
# Tag every review with its target label: 1 for Barbie, 0 for Oppenheimer.
for frame, label in ((cleand_df_bb, 1), (cleand_df_op, 0)):
    frame['isBarbie'] = label

# Stack both sets into one frame with a fresh 0..n-1 index.
df = pd.concat((cleand_df_bb, cleand_df_op), ignore_index=True)

## Data Exploration

In [None]:
# Build the numeric feature table used for the plots and models below.

# get_cleaned_df stores the string 'NULL' for fields it could not parse.
# Coerce the count and date columns so such entries become NaN / NaT instead
# of raising during the division and the date parsing.
helpful_counts = pd.to_numeric(df['people_found_helpful'], errors='coerce')
total_counts = pd.to_numeric(df['total_people_viewed'], errors='coerce')
review_dates = pd.to_datetime(df['date'], format='%d %B %Y', errors='coerce')

numeric_df = pd.DataFrame({
    'isBarbie': df['isBarbie'],
    'score': df['score'],
    'date': review_dates,
    'people_found_helpful': helpful_counts,
    'total_people_viewed': total_counts,
    # Share of voters who marked the review as helpful.
    'helpful_ratio': helpful_counts / total_counts,
    'length_of_title': df['title'].str.len(),
    'length_of_username': df['username'].str.len(),
    'length_of_review': df['review'].str.len(),
})

# Days between the review date and the reference release date.
# NOTE(review): both films opened in US cinemas on 21 July 2023 -- confirm
# that 23 July is the intended reference date.
numeric_df['date_from_release'] = (numeric_df['date'] - pd.Timestamp('2023-07-23')).dt.days

In [None]:
# Notebook display of the numeric feature table.
numeric_df

In [None]:
# Correlation matrix

# Defensive filter: get_cleaned_df already skips rows without a score.
drop_na_df = numeric_df[numeric_df['score'] != 'NULL']

fig1 = plt.figure(figsize=(10, 10))
ax1 = fig1.add_subplot(111)

cmap = sns.diverging_palette(0, 255, n=256, as_cmap=True)

# Correlate numeric columns only.  The datetime 'date' column (and any
# object-dtype column) cannot be converted to float, which makes
# DataFrame.corr() raise on pandas versions that no longer drop such
# columns automatically.
correlations = drop_na_df.select_dtypes(include='number').corr()
sns.heatmap(data=correlations, ax=ax1, cmap=cmap, annot=True)

plt.show()

In [None]:
# Box plot of review timing (days relative to the reference release date)
# for each film.
fig2, ax2 = plt.subplots(figsize=(10, 10))
sns.boxplot(x='isBarbie', y='date_from_release', data=numeric_df, ax=ax2)
plt.show()

In [None]:
# One score histogram per film (isBarbie == 0 and isBarbie == 1).
group_by = df.groupby('isBarbie')
for movie_id, group_data in group_by:
    score_fig, score_ax = plt.subplots(figsize=(8, 8))
    score_ax.hist(group_data['score'], edgecolor='black')
    score_ax.set_title(f'Score Distribution for isBarbie: {movie_id}')
    score_ax.set_xlabel('Score')
    score_ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# Text columns plus the target label, used as input for the text models.
textdata = df[['review', 'title', 'isBarbie']]

## Modeling

### BERT

In [None]:
# Stratified 80/20 split of the review texts.
# The features are passed as the 1-D 'review' Series rather than a one-column
# DataFrame: the BERT input layer is declared with shape=() (one scalar
# string per example), so the model expects a flat batch of strings.
X_train, X_test, y_train, y_test = train_test_split(
    textdata['review'],
    textdata['isBarbie'],
    stratify=textdata['isBarbie'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Load the BERT encoder and its matching text preprocessing model from
# TensorFlow Hub as Keras layers (fetched from the URLs below).
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")

In [None]:
# --- BERT feature extractor: raw string -> preprocessed tokens -> encoder ---
text_input = Input(shape=(), dtype=tf.string)
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# --- Classification head on the pooled sentence embedding ---
pooled_embedding = outputs['pooled_output']
dropout_layer = tf.keras.layers.Dropout(0.1, name="dropout")
l = dropout_layer(pooled_embedding)
sigmoid_head = Dense(1, activation='sigmoid')
l = sigmoid_head(l)

# Wire the string input to the sigmoid output.
bert_model = tf.keras.Model(inputs=[text_input], outputs=[l])

# NOTE(review): learning_rate=0.2 is two orders of magnitude above Adam's
# usual default of 1e-3 -- confirm this value is intentional.
bert_optimizer = Adam(learning_rate=0.2)
bert_model.compile(
    optimizer=bert_optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Single pass over the training reviews.
bert_history = bert_model.fit(X_train, y_train, batch_size=512, epochs=1)

In [None]:
# Predict on the held-out reviews; the model's final layer is a sigmoid,
# so each prediction is a value between 0 and 1.
bert_pred = bert_model.predict(X_test)

### Bag of Words

In [None]:
# Keep only the review text and the target label for the bag-of-words model,
# then display the frame.
textdata = df[['review', 'isBarbie']]
textdata

In [None]:
# Turn every review into a vector of raw word counts.
vectorizer = CountVectorizer()
reviews = textdata["review"]
X = vectorizer.fit_transform(reviews)

# Densify the sparse count matrix into a frame with one column per word.
vocabulary = vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(data=X.toarray(), columns=vocabulary)

# Append the label column.  concat aligns on the index; both frames use a
# 0..n-1 index here, so rows line up one to one.
final_df = pd.concat(objs=[bow_df, textdata["isBarbie"]], axis="columns")

In [None]:
# Separate the target label from the word-count features.
y = final_df["isBarbie"]
X = final_df.drop(columns="isBarbie")

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit a decision tree on the bag-of-words counts.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

### ANN

In [None]:
# Print how many reviews are in the combined data set.
print(len(df['review']))
# print(df['review'][2185])

In [None]:
from collections import Counter

# Hand-crafted per-review counts, collected in one pass over the reviews:
#   opp_inst    - occurrences of the token "Oppenheimer" / "oppenheimer"
#   barbie_inst - occurrences of the token "Barbie" / "barbie"
#   movie_inst  - occurrences of the token "Movie" / "movie"
#   excl_inst   - number of exclamation marks
opp_inst = []
barbie_inst = []
movie_inst = []
excl_inst = []

for review_text in df['review']:
    # Tokens are whitespace-separated, so a word with punctuation attached
    # (e.g. "Barbie,") is a different token and is not counted.
    token_counts = Counter(review_text.split())
    opp_inst.append(token_counts['Oppenheimer'] + token_counts['oppenheimer'])
    barbie_inst.append(token_counts['Barbie'] + token_counts['barbie'])
    movie_inst.append(token_counts['Movie'] + token_counts['movie'])
    excl_inst.append(review_text.count('!'))

# Feature table for the ANN: the four counts, two numeric features taken
# from numeric_df, and the target label as the last column.
new_df = pd.DataFrame({
    'opp_inst': opp_inst,
    'barbie_inst': barbie_inst,
    'movie_inst': movie_inst,
    'excl_inst': excl_inst,
})
new_df['score'] = numeric_df['score']
new_df['date_from_release'] = numeric_df['date_from_release']
new_df['isBarbie'] = numeric_df['isBarbie']

In [None]:
# Shuffle the samples.  The seed is fixed (as in the other splits in this
# notebook) so the row order is reproducible from run to run.
new_df = new_df.sample(frac=1, random_state=42)
new_df

In [None]:
# Model inputs: the first six columns of new_df (the label is the seventh).
x_data = new_df.loc[:, new_df.columns[:6]]

In [None]:
# Min-max scale every input column, then wrap the resulting array in a
# frame with the original column names (the index is reset to 0..n-1).
scaler = MinMaxScaler()
scaled_values = scaler.fit(x_data).transform(x_data)
normalized_inputs = pd.DataFrame(scaled_values, columns=x_data.columns)
normalized_inputs

In [None]:
# Target labels, in the same (shuffled) row order as the model inputs.
y_data = new_df['isBarbie']

In [None]:
# Fully connected network: 6 inputs -> 16 -> 8 -> 4 -> 1, sigmoid throughout.
classifier = Sequential([
    Dense(units=16, activation='sigmoid', input_dim=6),
    Dense(units=8, activation='sigmoid'),
    Dense(units=4, activation='sigmoid'),
    Dense(units=1, activation='sigmoid'),  # single sigmoid output unit
])

In [None]:
# Adam optimizer with a learning rate of 1e-3 and binary cross-entropy loss.
# No metrics argument is passed to compile().
opt = tf.keras.optimizers.Adam(learning_rate=1e-03)
classifier.compile(optimizer = opt,loss = 'binary_crossentropy')

In [None]:
# 90/10 train/test split.  Inputs and labels are split in one call so every
# row stays paired with its label by construction, instead of relying on two
# separate calls that share a seed.
x_train, x_test, y_train, y_test = train_test_split(
    normalized_inputs,
    y_data,
    test_size=0.1,
    random_state=36,
)

In [None]:
# Train the ANN for 100 epochs with mini-batches of 30 samples.
history = classifier.fit(x_train, y_train, batch_size = 30, epochs = 100)

## Ensemble

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier, StackingRegressor 