In [2]:
from nltk import word_tokenize, pos_tag, ne_chunk
import string
import warnings
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
warnings.filterwarnings("ignore")
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.metrics import f1_score
from sklearn.svm import LinearSVC
from wordcloud import WordCloud,STOPWORDS
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import RidgeClassifier,LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score


ModuleNotFoundError: No module named 'nltk'

In [None]:
df_train = pd.read_csv("Data/train.csv")
df_game = pd.read_csv("Data/game_overview.csv")
df_test = pd.read_csv("Data/test.csv")


In [None]:
print(df_train.head())
print(df_test.head())


In [None]:
print("Number of rows are", df_train.shape[0], ".Number of columns is", df_train.shape[1])
print("Number of rows are", df_test.shape[0], ".Number of columns is", df_test.shape[1])


In [None]:
# Data Type of each column.Return Object.
print("Dataframe data types")
print(df_train.dtypes)
print(df_test.dtypes)


In [None]:
# Return column names as Index object.
print("Dataframe column data types")
print(df_train.columns)
print(df_test.columns)


In [None]:
# Class distribution of target available.
print("Distribution of target variable")
print(df_train.user_suggestion.value_counts())


In [None]:
# Class distribution of title variable.
print("Distribution of title variable")
print(df_train.title.value_counts())


In [None]:
print("Converting all letters to lowercase")
# Convert text to lowercase.
def tolowercase(text):
    text = text.lower()
    return text

df_train['user_review'] = df_train.user_review.apply(tolowercase)
df_test['user_review'] = df_test.user_review.apply(tolowercase)

In [None]:
print("Removing punctuation")

# Remove punctuation.
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

df_train['user_review'] = df_train.user_review.apply(remove_punctuation)
df_test['user_review'] = df_test.user_review.apply(remove_punctuation)


In [None]:
print("Perform lemmatization")

# Lemmatization:
def do_lemmatization(text):
    lemma_words = set([])
    lemmatizer = WordNetLemmatizer()
    text = word_tokenize(text)
    for word in text:
        lemma_words.add(lemmatizer.lemmatize(word))
    return " ".join(lemma_words)

df_train['user_review'] = df_train.user_review.apply(do_lemmatization)
df_test['user_review'] = df_test.user_review.apply(do_lemmatization)


In [None]:
print("Perform named entity recognition")

def named_entity_recognition(text):
    result = ne_chunk(pos_tag(word_tokenize(text)))
    return result

text_1 = df_train.user_review[0]
ner = named_entity_recognition(text_1)
print(ner)




In [None]:
print("Part-of-speech tagging using NLTK")

def pos_tagging(text):
    text = word_tokenize(text)
    tokens_tag = pos_tag(text)
    return tokens_tag

df_train['pos_tagging'] = df_train.user_review.apply(pos_tagging)
print(df_train['pos_tagging'])


In [None]:
# NUMBER OF 15 MOST FREQUENT TERMS.
token = nltk.word_tokenize(''.join(df_train.user_review))
frequent = nltk.FreqDist(token)
print(frequent.most_common(15))


In [None]:
# Text with highest number of words.
df_train['number_of_words'] = df_train['user_review'].apply(lambda x: len(str(x).split()))
print('Maximum number of word',df_train['number_of_words'].max())
print('\nSentence:\n',df_train[df_train['number_of_words'] == 587]['user_review'].values)


In [None]:
X = df_train['user_review']
y = df_train['user_suggestion']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


In [None]:
# # Feature Extraction using TFIDF-Char Based.
tfidf_vec = TfidfVectorizer(ngram_range=(1,5), stop_words='english', analyzer='char',max_features=5000)
print(type(tfidf_vec))  # TfidfVectorizer class.
train_tfidf_vec = tfidf_vec.fit_transform(X_train)
print(type(train_tfidf_vec))  # Sparse CSR matrix.
valid_tfidf_vec = tfidf_vec.transform(X_test)
print(type(valid_tfidf_vec))  # Sparse CSR matrix.
test_tfidf_vec = tfidf_vec.transform(df_test['user_review'])
train_vector_array = train_tfidf_vec.toarray()
valid_vector_array = valid_tfidf_vec.toarray()
test_vector_array = test_tfidf_vec.toarray()

print(train_vector_array.shape)
print(valid_vector_array.shape)
print(test_vector_array.shape)


In [None]:
# Linear support vector classifier.
lsvc = LinearSVC(C=1,loss= 'hinge',random_state=999)
lsvc.fit(train_vector_array, y_train)
y_pred = lsvc.predict(valid_vector_array)
print(f1_score(y_test,y_pred,average='micro'))   # 0.8250


In [None]:
# Logistic Regression.
lr = LogisticRegression(random_state=999)
lr.fit(train_vector_array, y_train)
y_pred = lr.predict(valid_vector_array)
print(f1_score(y_test,y_pred,average='micro'))  # 0.8128




In [None]:
# Random Forest Classifier.
rfc = RandomForestClassifier(n_estimators=300, random_state=999)
rfc.fit(train_vector_array, y_train)
pred = rfc.predict(valid_vector_array)
print(f1_score(y_test, pred))  # 0.8270



In [None]:
# Bernoulli Naive Bayes.
bnb = BernoulliNB()
bnb.fit(train_vector_array,y_train)
y_pred = bnb.predict(valid_vector_array)
print(f1_score(y_test,y_pred,average='micro'))  # 0.6547


In [None]:
# Ridge Classifier.
ridge = RidgeClassifier(random_state=999)
ridge.fit(train_vector_array,y_train)
y_pred = ridge.predict(valid_vector_array)
print(f1_score(y_test,y_pred,average='micro'))  # 0.8348


In [None]:
pred_final = ridge.predict((test_vector_array))
df = pd.DataFrame({'review_id': df_test['review_id'], 'user_suggestion': pred_final})
df.to_csv("submission_results.csv")

In [None]:
#################################################
# LDA Topic Modelling:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(max_df=0.8, min_df=2, stop_words='english')
doc_term_matrix = count_vect.fit_transform(X_train)
print(doc_term_matrix)

In [None]:
# Each of x documents is represented as y dimensional vector,which means that our vocabulary has y words.
from sklearn.decomposition import LatentDirichletAllocation
LDA = LatentDirichletAllocation(n_components=5,
                                random_state=0)
print(type(LDA))  # LatentDirichletAllocation class.
LDA.fit(doc_term_matrix)
print(LDA.components_)

In [None]:
# For each topic,each word of the document is assigned a weight.
# Higher weight means it is the top word of the topic.
# It is a multidimensional array.Each row represent the topic,each column represents the word in a document.
# Shape = [n_topics,n_words] or [n_components, n_features]

# Define helper function to print top words for each topic.
def print_top_words(model, feature_names, n_top_words):
    for index, topic in enumerate(model.components_):
        print(topic)
        message = "\nTopic #{}:".format(index)
        print(message)
        print([(feature_names[i], topic[i]) for i in topic.argsort()[:-n_top_words - 1:-1]])
        # feature_names[i] is a word,topic[i] is the weight of the word for that topic.
        print("=" * 70)

number_of_words = 50
print("\nTopics in LDA model: ")
tf_feature_names = count_vect.get_feature_names()
print_top_words(LDA, tf_feature_names, number_of_words)
#####################################################

In [None]:
######################################################
# Topic Modelling : NMF: Non-Matrix factorization.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(max_df=0.8, min_df=2, stop_words='english')
doc_term_matrix = tfidf_vect.fit_transform(X_train)
print(doc_term_matrix)

In [None]:
from sklearn.decomposition import NMF
nmf = NMF(n_components=5, random_state=42)
print(type(nmf))
nmf.fit(doc_term_matrix)
print(nmf.components_)

In [None]:
# # For each topic,each word of the document is assigned a weight.
# # Higher weight means it is the top word of the topic.
# # It is a multidimensional array.Each row represent the topic,each column represents the word in a document.
# Shape - [n_topics,n_words] or [n_components, n_features].
# Factorization matrix.

# Define helper function to print top words.
def print_top_words(model, feature_names, n_top_words):
    for index, topic in enumerate(model.components_):
        message = "\nTopic #{}:".format(index)
        print(message)
        print([(feature_names[i], topic[i]) for i in topic.argsort()[:-n_top_words - 1:-1]])
        print("=" * 70)

number_of_words = 50
print("\nTopics in NMF model: ")
tf_feature_names = tfidf_vect.get_feature_names()  # note that tf_vectorizer is an LemmaCountVectorizer object and with this command we get the whole dictionary of words
print_top_words(nmf, tf_feature_names, number_of_words)

######################################################
