In [None]:
import os
# Walk the /kaggle/input directory tree and print the full path of every file found,
# so the available dataset files are visible before any of them is loaded.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder



import nltk
import subprocess

# Download and unzip wordnet into the writable working directory, then register that
# directory on NLTK's search path.
try:
    nltk.data.find('wordnet.zip')
except LookupError:
    # nltk.data.find signals a missing resource with LookupError. Catching only that
    # keeps unrelated failures (including KeyboardInterrupt) from being swallowed.
    nltk.download('wordnet', download_dir='/kaggle/working/')
    # -o overwrites files left by a previous run instead of prompting on stdin.
    command = "unzip -o /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora"
    subprocess.run(command.split())
    nltk.data.path.append('/kaggle/working/')

# Now you can import the NLTK resources as usual
from nltk.corpus import wordnet


nltk.download('wordnet')
nltk.download('stopwords')

In [None]:
# Filepaths
# All four files sit in the same "Genre Classification Dataset" folder of the Kaggle input.
# description_path   -> description.txt (printed further down)
# test_data_path     -> test_data.txt (loaded without a Genre column)
# test_data_sol_path -> test_data_solution.txt (loaded with a Genre column)
# train_data_path    -> train_data.txt (loaded with a Genre column)
description_path = "/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/description.txt"
test_data_path = "/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/test_data.txt"
test_data_sol_path = "/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/test_data_solution.txt"
train_data_path = "/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/train_data.txt"


In [None]:
# Function to read txt file
def read_txt_file(filepath, encoding='utf-8'):
    """Return the entire contents of a text file as one string.

    Parameters
    ----------
    filepath : str or path-like
        Location of the file to read.
    encoding : str, default 'utf-8'
        Encoding used to decode the file. Passing it explicitly makes the result
        independent of the platform's default locale encoding.

    Returns
    -------
    str
        The full contents of the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file contents are not valid in ``encoding``.
    """
    with open(filepath, 'r', encoding=encoding) as file:
        return file.read()

In [None]:
# Reading description file
description = read_txt_file(description_path)
print(description)

Here we see that the fields on each line are separated by `:::`, so we will pass the delimiter parameter ```sep=':::'``` to the ```pd.read_csv()``` function when creating the data frames.

In [None]:
# creating train_data dataframe
# The file has no header row; fields are split on the ':::' separator using the
# python parsing engine.
train_df = pd.read_csv(train_data_path, sep=':::', header=None, engine='python')

# Give the columns descriptive names
train_df.columns = ['Id','Title','Genre','Description']

# info() writes its summary itself and returns None, so wrapping it in print()
# would only add a stray "None" line to the output.
train_df.info()
print(type(train_df))
train_df.head()

In [None]:
# creating test_data dataframe
# The file has no header row; fields are split on the ':::' separator using the
# python parsing engine.
test_df = pd.read_csv(test_data_path, sep=':::', header=None, engine='python')

# Give the columns descriptive names (this file is loaded without a Genre column)
test_df.columns = ['Id','Title','Description']

# info() writes its summary itself and returns None, so wrapping it in print()
# would only add a stray "None" line to the output.
test_df.info()
print(type(test_df))
test_df.head()

In [None]:
# creating test_data_solution dataframe
# The file has no header row; fields are split on the ':::' separator using the
# python parsing engine.
test_sol_df = pd.read_csv(test_data_sol_path, sep=':::', header=None, engine='python')

# Give the columns descriptive names (same four columns as the training frame)
test_sol_df.columns = ['Id','Title','Genre','Description']

# info() writes its summary itself and returns None, so wrapping it in print()
# would only add a stray "None" line to the output.
test_sol_df.info()
print(type(test_sol_df))
test_sol_df.head()

# **DATA CLEANING**

In [None]:
train_df.info()

Here we see that the ```Title``` column won't help in classifying the ```Genre```, as many of the titles are in languages other than English. So we will extract features only from the ```Description``` column.

In [None]:
# missing values
train_df.isnull().sum()

There are no missing values in the dataset.

In [None]:
# check for duplicate values
train_df.duplicated().sum()

There are no duplicate rows in the dataset.

In [None]:
# Distinct genre labels found in the training frame, and how many there are
output_categories = train_df['Genre'].unique()
print(output_categories.shape[0])
output_categories

In [None]:
# Getting all the classes (Genres)
genres = train_df.Genre
print(genres)

In [None]:
# Label encoding the output categories
le = LabelEncoder()
le.fit(genres)
le.classes_

In [None]:
genres = le.transform(genres)
genres

In [None]:
train_df['Labeled Genre'] = genres

In [None]:
train_df.head()

In [None]:
cat = le.inverse_transform([2])
# cat = str(cat)
print(cat[0])
print(type(cat[0]))

In [None]:
# print(cat)

# **PERFORMING EDA**

In [None]:
train_df.head()

In [None]:
train_df['Genre'].value_counts()

To better visualize the distribution of the output categories, we plot a pie chart.

In [None]:
# Pie chart of the genre distribution in the training set.
plt.figure(figsize=(8,8))
values = train_df['Genre'].value_counts()
# value_counts() orders the genres by frequency, so the wedge labels must come from
# its own index. A separately ordered label array (such as the result of unique())
# would attach the wrong genre name to each wedge.
# The explode tuple has one radial offset per wedge, in the same order as `values`.
plt.pie(values, labels=values.index, autopct='%1.1f', startangle=90, radius=1.2, explode=(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.8, 0.75, 0.65, 0.55, 0.45, 0.35, 0.2, 0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2))
plt.show()

In [None]:
# Plotting the number of counts of each genre in the training set
train_df.Genre.value_counts()[train_df.Genre.unique()].plot(kind='bar')
plt.show()

From the above two visualizations, we clearly see that the data is imbalanced.

In [None]:
# Converting the Description column (Series type) to numpy array
desc = train_df['Description']
# Preview the first two descriptions. The index gets a real name because `_`
# conventionally marks an unused value and is also IPython's last-output variable,
# which should not be rebound by a loop.
for row_label in range(2):
    print(desc[row_label])
    print("\n")
desc_arr = np.array(desc.tolist())
print(type(desc_arr))

In [None]:
# Lemmatizer instance; preprocess_text calls wordnet.lemmatize() on every kept word.
# NOTE(review): this rebinds the name `wordnet`, which the import cell bound to
# nltk.corpus.wordnet. After this cell `wordnet` refers to the lemmatizer, not the corpus.
wordnet = WordNetLemmatizer()

In [None]:
# Cache of stopword sets keyed by language, so the list is loaded once instead of
# once per word of every description.
_STOPWORD_SETS = {}


def _stopword_set(language='english'):
    """Return the cached frozenset of NLTK stopwords for ``language``."""
    if language not in _STOPWORD_SETS:
        _STOPWORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOPWORD_SETS[language]


def preprocess_text(text):
    """Normalise a raw description into a space-separated string of lemmas.

    The text is processed in this order: every character that is not an ASCII
    letter is replaced by a space, the result is lowercased and split on
    whitespace, English stopwords are dropped, each remaining word is passed
    through ``wordnet.lemmatize`` (the notebook-level lemmatizer), and the
    lemmas are joined with single spaces.

    Parameters
    ----------
    text : str
        Raw description text.

    Returns
    -------
    str
        The cleaned text; an empty string if no word survives the filtering.
    """
    # removing special characters & numbers and just keeping alphabets
    letters_only = re.sub("[^a-zA-Z]", " ", text)

    # lowercasing, then tokenising on whitespace
    tokens = letters_only.lower().split()

    # removing stopwords (set built once and reused) and lemmatizing what is left
    stop_words = _stopword_set()
    lemmas = [wordnet.lemmatize(word) for word in tokens if word not in stop_words]

    return ' '.join(lemmas)

In [None]:
print(train_df['Description'][0])
t = preprocess_text(train_df['Description'][0])
print("\n>>> After processing:")
t

In [None]:
# Now applying the "preprocess_text" func. to the description column and adding a column which will contain the processed descriptions
train_df['Processed Description'] = train_df['Description'].apply(preprocess_text)

In [None]:
train_df.head()

In [None]:
from wordcloud import WordCloud
wc = WordCloud(width=300, height=300, min_font_size=10, background_color='white')

In [None]:
# One word cloud per encoded genre label, laid out on a 5 x 6 grid of axes.
fig, axs = plt.subplots(nrows=5, ncols=6,figsize=(16,20))
plt.subplots_adjust(hspace=0.5)
fig.suptitle('Plotting word clouds for each Genre category')

genre_total = len(train_df['Genre'].unique())
panels = axs.flatten()  # row-major order, so panels[i] is axs[i//6, i%6]

for i in range(genre_total):
    ax = panels[i]
    # Concatenate every processed description of this genre into one text
    genre_rows = train_df['Labeled Genre'] == i
    genre_text = train_df[genre_rows]['Processed Description'].str.cat(sep=" ")
    cloud = wc.generate(genre_text)
    ax.imshow(cloud)
    ax.set_title(le.inverse_transform([i])[0])

# Hide any remaining empty subplots
for i in range(genre_total, 5*6):
    panels[i].axis('off')

plt.show()

In [None]:
# Now building a corpus which will be a 2d list with 27 rows one row for each genre's description corpus.
# Row i holds every word of every processed description whose encoded label is i.
# The loop names are scoped to the comprehension, so this cell no longer overwrites
# the notebook-level `desc` Series, and the genre count is computed once.
n_genres = len(train_df['Genre'].unique())
corpus = [
    [
        word
        for processed in train_df[train_df['Labeled Genre'] == label]['Processed Description'].tolist()
        for word in processed.split()
    ]
    for label in range(n_genres)
]


# the length of corpus should be 27
print(len(corpus))

In [None]:
from collections import Counter

In [None]:
# One bar chart per genre showing its 20 most frequent words, on a 9 x 3 grid.
fig, axs = plt.subplots(nrows=9, ncols=3,figsize=(16,30))
plt.subplots_adjust(hspace=0.75)
fig.suptitle('Bar Graphs showing the Most Common words')

# Dedicated seeded generator: the bar colours are the same on every run and the
# global NumPy random state is left untouched.
color_rng = np.random.default_rng(0)

for i, genre_words in enumerate(corpus):
    ax = axs[i//3,i%3]
    # Column 0 holds the word, column 1 its count
    top_words = pd.DataFrame(Counter(genre_words).most_common(20))
    # One RGB triple per bar actually drawn
    colors = color_rng.random((len(top_words), 3))
    ax.bar(top_words[0], top_words[1], color=colors)
    ax.set_title(le.inverse_transform([i])[0])

    # Setting tick positions and labels (rotated so the words do not overlap)
    ax.set_xticks(np.arange(len(top_words[0])))
    ax.set_xticklabels(top_words[0].tolist(),rotation=90)

# The labels are already rotated per axes above. The former plt.xticks(...) call
# only touched the last axes and was redundant.
plt.show()

# **BUILDING THE MODEL**

In [None]:
import gensim
from gensim.models import word2vec
from gensim.models.word2vec import Word2Vec

In [None]:
import gensim.downloader as api
print(list(gensim.downloader.info()['models'].keys()))

In [None]:
wv = api.load('word2vec-google-news-300')

In [None]:
type(wv)

In [None]:
def sent_vec(sent):
    """Return the mean embedding of the in-vocabulary words of ``sent``.

    The sentence is split on whitespace; each word found in the notebook-level
    ``wv`` keyed vectors contributes its vector to the average. Words missing
    from ``wv`` are ignored.

    Parameters
    ----------
    sent : str
        Whitespace-separated (already preprocessed) text.

    Returns
    -------
    numpy.ndarray
        Array of length ``wv.vector_size``. If no word of ``sent`` is in the
        vocabulary (including an empty string), the zero vector is returned
        instead of the NaN vector a 0/0 division would produce, so downstream
        estimators do not receive NaN features.
    """
    total = np.zeros(wv.vector_size)
    hits = 0
    for w in sent.split():
        if w in wv:
            total += wv[w]
            hits += 1

    # Nothing to average: keep the zero vector rather than dividing by zero.
    if hits == 0:
        return total
    return total / hits

In [None]:
train_df['Vectorized'] = train_df['Processed Description'].apply(sent_vec)

In [None]:
train_df.head()

In [None]:
X_train = train_df['Vectorized'].tolist()
y_train = train_df['Labeled Genre'].tolist()

In [None]:
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score, classification_report
from sklearn.linear_model import LogisticRegression

In [None]:
gnb = GaussianNB()
bnb = BernoulliNB()
lg = LogisticRegression(multi_class='ovr')

In [None]:
gnb.fit(X_train, y_train)
bnb.fit(X_train, y_train)
lg.fit(X_train, y_train)

In [None]:
train_df.columns

In [None]:
test_sol_df['Processed Description'] = test_sol_df['Description'].apply(preprocess_text)

In [None]:
test_sol_df['Vectorized'] = test_sol_df['Processed Description'].apply(sent_vec)

In [None]:
test_genres = test_sol_df['Genre']
test_sol_df['Labeled Genre'] = le.transform(test_genres)
test_sol_df.head()

In [None]:
X_test = test_sol_df['Vectorized'].tolist()
y_test = test_sol_df['Labeled Genre'].tolist()

In [None]:
print(len(X_test))
print(len(y_test))

In [None]:
# Evaluate the logistic regression model on the test set.
# scikit-learn metrics take (y_true, y_pred) in that order. Passing them swapped
# exchanges precision with recall and reports support for the predicted labels
# instead of the true ones.
y_pred = lg.predict(X_test)
print("Accuracy: ",accuracy_score(y_test, y_pred)*100,"%")
print("Weighted Avg Precision:", precision_score(y_test, y_pred, average='weighted')*100,"%")
print(classification_report(y_test, y_pred))

In [None]:
# Evaluate the Gaussian naive Bayes model on the test set.
# scikit-learn metrics take (y_true, y_pred) in that order. Passing them swapped
# exchanges precision with recall and reports support for the predicted labels
# instead of the true ones.
y_pred = gnb.predict(X_test)
print("Accuracy: ",accuracy_score(y_test, y_pred)*100,"%")
print("Weighted Avg Precision:", precision_score(y_test, y_pred, average='weighted')*100,"%")
print(classification_report(y_test, y_pred))

In [None]:
# Evaluate the Bernoulli naive Bayes model on the test set.
# scikit-learn metrics take (y_true, y_pred) in that order. Passing them swapped
# exchanges precision with recall and reports support for the predicted labels
# instead of the true ones.
y_pred = bnb.predict(X_test)
print("Accuracy: ",accuracy_score(y_test, y_pred)*100,"%")
print("Weighted Avg Precision:", precision_score(y_test, y_pred, average='weighted')*100,"%")
print(classification_report(y_test, y_pred))

***Result***
---
Among the three classification reports of LogisticRegression, GaussianNB and BernoulliNB models, we see that LogisticRegression has the best performance.
---