# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import nltk
import re
import string
import pickle
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import WordNetLemmatizer
from collections import Counter
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from nltk.sentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud
from tensorflow.keras.layers import LSTM, Dense , Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from mlxtend.plotting import plot_confusion_matrix
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Load Data

In [None]:
# Load the episode table. The path is absolute and specific to one machine, so
# adjust it when running the notebook elsewhere.
df = pd.read_csv("/mnt/hdd/Datasets/mr_robot.csv")
# Preview the first rows.
df.head()

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
df.info()

# EDA

In [None]:
episode_rating_sorted = df.sort_values(by="IMDb Rating", ascending=False)[["Episode Title", "IMDb Rating"]]
episode_rating_sorted.head()

In [None]:
plt.figure()
ax = sns.barplot(data=episode_rating_sorted.head(10), y="Episode Title", x="IMDb Rating")
ax.bar_label(ax.containers[0])
ax.set_title("Best 10 Episodes according to IMDB Ratings")
plt.show()

In [None]:
plt.figure()
ax = sns.barplot(data=episode_rating_sorted.tail(10), y="Episode Title", x="IMDb Rating")
ax.bar_label(ax.containers[0])
ax.set_title("Worst 10 Episodes according to IMDb Ratings")
plt.show()

In [None]:
def total_minute(runtime):
    """Collapse the first two fields of a colon-separated runtime into one int.

    Only the first two colon-separated fields are read; the first is weighted
    by 60 relative to the second, so ``"01:02:00"`` yields ``62``. Any further
    fields are ignored. A field that is not an integer raises ``ValueError``.
    """
    leading_fields = [int(field) for field in runtime.split(":")[:2]]
    total = 0
    for value in leading_fields:
        total = total * 60 + value
    return total

In [None]:
total_minute("01:02:00")

In [None]:
df["Runtime Minutes"] = df["Runtime"].apply(total_minute)
df["Runtime Minutes"] = df["Runtime Minutes"].astype("int")

In [None]:
df.head()

In [None]:
episode_runtime_sorted = df.sort_values(by="Runtime Minutes", ascending=False)[["Episode Title", "Runtime Minutes"]]
episode_runtime_sorted.head()

In [None]:
plt.figure()
ax = sns.barplot(data=episode_runtime_sorted.head(10), y="Episode Title", x="Runtime Minutes")
ax.bar_label(ax.containers[0])
ax.set_title("Longest 10 Episodes")
plt.show()

In [None]:
plt.figure()
ax = sns.barplot(data=episode_runtime_sorted.tail(10), y="Episode Title", x="Runtime Minutes")
ax.bar_label(ax.containers[0])
ax.set_title("Shortest 10 Episodes")
plt.show()

In [None]:
def find_season(text):
    """Extract the season identifier from a "Season/Episode" code.

    Takes everything before the first "-", trims surrounding whitespace and
    removes every "S" character. The remainder is returned as a string.
    """
    prefix, _, _ = text.partition("-")
    return prefix.strip().replace("S", "")

In [None]:
df["Season"] = df["Season/Episode"].apply(find_season)

In [None]:
df.head()

In [None]:
ratings_per_season =  df.groupby(by="Season", as_index=False)[["IMDb Rating"]].mean()
ratings_per_season

In [None]:
plt.figure()
ax = sns.barplot(data=ratings_per_season, x="Season", y="IMDb Rating")
ax.bar_label(ax.containers[0])
plt.xlabel("Season")
plt.ylabel("Rating")
plt.title("Ratings per Season")
plt.show()

In [None]:
runtime_per_season =  df.groupby(by="Season", as_index=False)[["Runtime Minutes"]].sum()
runtime_per_season

In [None]:
plt.figure()
plt.pie(runtime_per_season["Runtime Minutes"], labels=runtime_per_season["Season"], autopct="%.2f%%", startangle=90)
plt.title("Minutes per Season")
plt.legend(loc="lower right")
plt.show()

In [None]:
sentiment_analyzer = SentimentIntensityAnalyzer()

In [None]:
# Accumulators filled as a side effect of sentiment_score(): every call appends
# one value to each list. Re-run this cell before re-applying sentiment_score,
# otherwise the lists keep growing and no longer line up with the rows of df.
neg = []
neu = []
pos = []

In [None]:
def sentiment_score(text):
    """Label *text* as "NEGATIVE", "NEUTRAL" or "POSITIVE".

    The text is scored with the module-level ``sentiment_analyzer``. As a side
    effect the three component scores are appended to the module-level lists
    ``neg``, ``neu`` and ``pos`` (one entry per call). The returned label is
    the one with the largest component score; on a tie the earlier of
    NEGATIVE, NEUTRAL, POSITIVE wins.
    """
    polarity = sentiment_analyzer.polarity_scores(text)
    negative = polarity.get("neg")
    neutral = polarity.get("neu")
    positive = polarity.get("pos")

    neg.append(negative)
    neu.append(neutral)
    pos.append(positive)

    candidates = (
        ("NEGATIVE", negative),
        ("NEUTRAL", neutral),
        ("POSITIVE", positive),
    )
    # max() returns the first of several equal maxima, so ties resolve in the
    # order the candidates are listed above.
    best_label, _ = max(candidates, key=lambda pair: pair[1])
    return best_label

In [None]:
df["Sentiment"] = df["Storyline"].apply(sentiment_score)

In [None]:
df.head()

In [None]:
df["Sentiment"].value_counts()

In [None]:
df["Neg_Score"] = neg
df["Neu_Score"] = neu
df["Pos_Score"] = pos

In [None]:
df.head()

In [None]:
sentiment_per_episode = df.groupby(by="Season", as_index=False)[["Neg_Score", "Neu_Score", "Pos_Score"]].mean()
sentiment_per_episode

In [None]:
plt.figure()
ax = sns.barplot(data=sentiment_per_episode, x="Season", y="Neg_Score")
ax.bar_label(ax.containers[0])
plt.title("Negative Sentiment Rate per Season")
plt.show()

In [None]:
plt.figure()
ax = sns.barplot(data=sentiment_per_episode, x="Season", y="Neu_Score")
ax.bar_label(ax.containers[0])
plt.title("Neutral Sentiment Rate per Season")
plt.show()

In [None]:
plt.figure()
ax = sns.barplot(data=sentiment_per_episode, x="Season", y="Pos_Score")
ax.bar_label(ax.containers[0])
plt.title("Positive Sentiment Rate per Season")
plt.show()

In [None]:
ep1_story = " ".join(df[df["Season"] == "1"]["Storyline"])
ep1_story

In [None]:
ep2_story = " ".join(df[df["Season"] == "2"]["Storyline"])
ep2_story

In [None]:
ep3_story = " ".join(df[df["Season"] == "3"]["Storyline"])
ep3_story

In [None]:
ep4_story = " ".join(df[df["Season"] == "4"]["Storyline"])
ep4_story

In [None]:
def word_freq(words, title):
    """Plot and return the 15 most frequent tokens of a text.

    Args:
        words: Raw text. It is lower-cased, split on single spaces, and each
            piece is tokenised with ``nltk.word_tokenize``.
        title: Title shown above the horizontal bar chart.

    Returns:
        A list of ``(token, count)`` pairs as produced by
        ``Counter.most_common(15)``; tokens consisting only of punctuation
        characters are excluded.
    """
    punctuation_chars = set(string.punctuation)

    tokens = []
    for piece in words.lower().split(" "):
        tokens.extend(nltk.word_tokenize(piece))

    # Drop tokens made up solely of punctuation characters. Testing
    # ``token not in string.punctuation`` would be a substring test against
    # one long string, which lets multi-character punctuation tokens such as
    # "..." slip through into the counts.
    tokens = [
        token
        for token in tokens
        if not all(char in punctuation_chars for char in token)
    ]

    freq_top = Counter(tokens).most_common(15)

    # Separate names for the chart data so the ``words`` argument is not
    # shadowed.
    top_words = [word for word, _ in freq_top]
    top_counts = [count for _, count in freq_top]

    plt.barh(top_words, top_counts)
    plt.title(title)
    plt.xlabel("Frequency")
    plt.ylabel("Words")
    plt.show()

    return freq_top

In [None]:
def print_wordcloud(freq_top):
    """Draw a word cloud from ``(word, count)`` pairs.

    ``freq_top`` is converted to a dict and handed to
    ``WordCloud.generate_from_frequencies``; the resulting 350x350 image on a
    black background is shown in a new figure without axes.
    """
    frequencies = dict(freq_top)
    cloud_builder = WordCloud(
        width=350,
        height=350,
        background_color="black",
        min_font_size=5,
    )
    cloud = cloud_builder.generate_from_frequencies(frequencies)

    plt.figure()
    plt.imshow(cloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()

In [None]:
# ep1_story joins the storylines of every episode in season 1, so the chart is
# labelled by season rather than by episode.
ep1_freq_top = word_freq(ep1_story, "TOP 15 Words in Season 1 Storyline")

In [None]:
print_wordcloud(ep1_freq_top)

In [None]:
# ep2_story joins the storylines of every episode in season 2, so the chart is
# labelled by season rather than by episode.
ep2_freq_top = word_freq(ep2_story, "TOP 15 Words in Season 2 Storyline")

In [None]:
print_wordcloud(ep2_freq_top)

In [None]:
# ep3_story joins the storylines of every episode in season 3, so the chart is
# labelled by season rather than by episode.
ep3_freq_top = word_freq(ep3_story, "TOP 15 Words in Season 3 Storyline")

In [None]:
print_wordcloud(ep3_freq_top)

In [None]:
# ep4_story joins the storylines of every episode in season 4, so the chart is
# labelled by season rather than by episode.
ep4_freq_top = word_freq(ep4_story, "TOP 15 Words in Season 4 Storyline")

In [None]:
print_wordcloud(ep4_freq_top)

# Preprocess

In [None]:
df.head()

In [None]:
# Take an independent copy of the two columns used for modelling. Columns are
# assigned on story_df afterwards, and doing that on a slice of df triggers
# pandas' SettingWithCopyWarning.
story_df = df[["Storyline", "Season"]].copy()
story_df.head()

In [None]:
stop_words = set(stopwords.words("english"))

In [None]:
def clean(text):
    """Normalise a storyline for tokenisation.

    Steps, in order: remove every character that is neither a word character
    nor whitespace, lower-case, remove digit runs, drop tokens found in the
    module-level ``stop_words`` set, and lemmatise the remaining tokens with
    ``WordNetLemmatizer``. Tokens are returned joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()

    without_punctuation = re.sub(r"[^\w\s]", "", text)
    without_digits = re.sub(r"\d+", "", without_punctuation.lower())

    kept_tokens = (
        token for token in without_digits.split() if token not in stop_words
    )
    lemmas = [lemmatizer.lemmatize(token) for token in kept_tokens]
    return " ".join(lemmas).strip()

In [None]:
story_df["Cleaned"] = story_df["Storyline"].apply(clean)

In [None]:
story_df.head()

In [None]:
story_df.info()

In [None]:
# Convert the season labels from strings to numbers in one vectorised call.
story_df["Season"] = pd.to_numeric(story_df["Season"])

In [None]:
story_df = story_df.sample(frac=1).reset_index(drop=True)

# Model

In [None]:
X = story_df["Cleaned"]
y = story_df["Season"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [None]:
# pad_sequences() pads/truncates to `maxlen` *tokens*, so measure every text in
# tokens. len(text) would count characters and make each padded sequence far
# longer than any real storyline, leaving the LSTM to read mostly padding.
# text_to_word_sequence is the splitter the Keras Tokenizer applies by default,
# so the count matches the length of the sequences it produces.
maxlen = max(
    len(tf.keras.preprocessing.text.text_to_word_sequence(text)) for text in X
)
maxlen

In [None]:
# Build the vocabulary from the training texts only, so nothing from the
# held-out test set leaks into the model. Words that were never seen during
# training map to the reserved "<OOV>" index instead of being dropped silently.
# NOTE: X_train must still hold the cleaned strings here, i.e. run this cell
# before the cell that converts X_train into padded sequences.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

In [None]:
with open("tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [None]:
X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen=maxlen)

X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=maxlen)

In [None]:
input_dim = len(tokenizer.word_index) + 1
input_dim

In [None]:
# Embedding -> LSTM -> softmax classifier over the padded token sequences.
model = Sequential()
model.add(Embedding(input_dim=input_dim, output_dim=32, input_length=maxlen))
model.add(LSTM(64))
# NOTE(review): five output units; the targets appear to be the raw season
# numbers, so with seasons 1-4 unit 0 would never be a target - confirm.
model.add(Dense(5, activation="softmax"))

In [None]:
model.summary()

In [None]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
callbacks = [EarlyStopping(monitor="val_loss", patience=5)]

In [None]:
history = model.fit(X_train, y_train, epochs=10, validation_split=0.2, batch_size=2, callbacks=callbacks)

In [None]:
model.save("mr_robot.h5")

In [None]:
# Training vs. validation accuracy per epoch.
plt.figure()
plt.plot(history.history["accuracy"], label="train")
plt.plot(history.history["val_accuracy"], label="valid")
plt.title("Model Accuracy")
plt.xlabel("Epochs")
# Keras reports accuracy as a fraction in [0, 1], not as a percentage.
plt.ylabel("Accuracy")
plt.legend()
plt.show()

In [None]:
# Training vs. validation loss per epoch.
plt.figure()
plt.plot(history.history["loss"], label="train")
plt.plot(history.history["val_loss"], label="valid")
plt.title("Model Loss")
plt.xlabel("Epochs")
# Cross-entropy loss is an unbounded value, not a percentage.
plt.ylabel("Loss")
plt.legend()
plt.show()

In [None]:
# model.predict returns one row of class probabilities per test sample; pick
# the most probable class for every row with a single vectorised arg-max.
class_probabilities = model.predict(X_test)
y_pred = np.argmax(class_probabilities, axis=1)

In [None]:
y_test

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))

In [None]:
# Pin the label set so the matrix is always 4x4 and lines up with class_names,
# even when a season is missing from the small test split or the predictions.
season_labels = [1, 2, 3, 4]
cm = confusion_matrix(y_test, y_pred, labels=season_labels)
class_names = [str(label) for label in season_labels]
plot_confusion_matrix(conf_mat=cm, show_absolute=True, show_normed=True, colorbar=True, class_names=class_names)
plt.title("Model Confusion Matrix")
plt.show()

In [None]:
# Classify one hand-written storyline with the trained model.
content = "Elliot tries to live a bug-free life."
# Apply the same cleaning, tokenisation and padding as for the training data.
context_clean = clean(content)
test = tokenizer.texts_to_sequences([context_clean])
test = pad_sequences(test, maxlen=maxlen)
res = model.predict(test)
# Raw softmax output of the model for the single input.
print(res)
# Index of the largest output value.
res = np.argmax(res)
print(res)