# NLP Final Project
## Topic
Create an NLP based model to understand the polarity of review and estimate a rating based on the review provided to a movie. Post we train a model to identify the polarity of review we will try to create a regression or classification model to map the review to a rating from range of 1-10.

## Loading and formating of input data

In [1]:
import os
import shutil
import glob

# creating new folders for sorting the ratings
for i in range(1,11):
    try:
        os.mkdir(f'./Cleaned_dataset/ratings_' + str(i))
    except:
        files = glob.glob('./Cleaned_dataset/ratings_' + str(i)+'/*')
        os.remove(files)

# filtering and copying negative reviews
neg_path = './Project Dataset/aclImdb/train/neg/'
neg_files = os.listdir(neg_path)
flip = False
for file_name in neg_files:
    try:
        # copying review to their respective rating folder
        rating = file_name.split('_')[1].split('.')[0]
        if rating == '4':
            if(not flip):
                shutil.copy2(neg_path + file_name,'./Cleaned_dataset/ratings_4/'+ file_name)
                flip = True
            else:
                shutil.copy2(neg_path + file_name,'./Cleaned_dataset/ratings_5/'+ file_name)
                flip = False
        else:
            shutil.copy2(neg_path + file_name,'./Cleaned_dataset/ratings_' + rating +'/' + file_name)
    except:
        print(neg_path + file_name)
        
# filtering and copying positive reviews
pos_path = './Project Dataset/aclImdb/train/pos/'
pos_files = os.listdir(pos_path)
flip = False
for file_name in pos_files:
    try:
        # copying review to their respective rating folder
        rating = file_name.split('_')[1].split('.')[0]
        if rating == '7':
            if(not flip):
                shutil.copy2(pos_path + file_name,'./Cleaned_dataset/ratings_7/' + file_name)
                flip = True
            else:
                shutil.copy2(pos_path + file_name,'./Cleaned_dataset/ratings_6/' + file_name)
                flip = False
        else:
            shutil.copy2(pos_path + file_name,'./Cleaned_dataset/ratings_' + rating + '/' + file_name)
    except:
        print(pos_path + file_name)

# Load data in to data frames

In [None]:
%%time
import pandas as pd
import os

data_set = pd.DataFrame(columns=['Rating','Review'])
ratings_folder_path = os.listdir('./Cleaned_dataset/')
for folder in ratings_folder_path:
    rating = folder.split('_')
    review_files = os.listdir('./Cleaned_dataset/' + folder + '/')
    for review in review_files:
        try:
            fp = open('./Cleaned_dataset/' + folder + '/' + review,'r')
            review_data = fp.read()
            rating_number = rating[1]
            data_set = data_set.append(pd.Series([rating_number,review_data], index=data_set.columns),ignore_index=True)
            fp.close()
        except:
            # try except to deal with error in file reading due to codec issues
            pass

print(data_set.head(10))

# Pre-processing data_set
### Removing punctuations, Stop words and Lemmatizing

In [None]:
%%time
import nltk
# download wordnet if required
# nltk.download('wordnet')
# download stopwords if required
# nltk.download('stopwords')
# loading English stop words  
stop_words = nltk.corpus.stopwords.words('english')
import string
import re

# Basic cleaning
def cleanAndTokenize(review):
    # removing punctuations
    non_punc_words = "".join([character for character in review if character not in string.punctuation])
    
    non_punc_words = non_punc_words.strip()
    
    # tokenizing reviews
    list_of_token = re.split('\W+',non_punc_words)
    
    # removing stop words
    tokens = [word for word in list_of_token if word not in stop_words]
    
    return tokens

# converting words to lower case.
data_set['Review'] = data_set['Review'].apply(lambda review : cleanAndTokenize(str(review).lower()))

# using nltk's wordnet lemmatizer
word_net_lemma = nltk.WordNetLemmatizer()

def lemmatize_data(token_list):
    tokens = [word_net_lemma.lemmatize(word) for word in token_list]
    return tokens

data_set['Review'] = data_set['Review'].apply(lambda review : lemmatize_data(review))

# un-comment to write data to file
# data_set.to_csv('./Cleaned_dataset/tokenized_words.csv',index = False, header=True)

# Vectorizing reviews using TF-IDF.

In [2]:
%%time
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import re
import nltk
# using nltk's wordnet lemmatizer
word_net_lemma = nltk.WordNetLemmatizer()

data_set = pd.read_csv('./Cleaned_dataset/tokenized_words.csv')

print(data_set.head())

def clean_review(review):
    non_punc_words = "".join([character for character in review if character not in string.punctuation])
    list_of_token = re.split('\W+',non_punc_words)
    tokens = [word_net_lemma.lemmatize(word) for word in list_of_token]
    return tokens

tfidf_vectorize = TfidfVectorizer(analyzer=clean_review)
vectorized_review = tfidf_vectorize.fit_transform(data_set['Review'])

pickle.dump(tfidf_vectorize,open('./pickle_tfidf/tfidf.pickle','wb'))
vectorized_review_df = pd.DataFrame(vectorized_review.toarray())
vectorized_review_df.columns = tfidf_vectorize.get_feature_names()
print(vectorized_review_df.head())

pickle.dump(vectorized_review,open('./pickle_tfidf/vectorized_review.pickle','wb'))

# un-comment to write data to file
# data_set.to_csv('./Cleaned_dataset/vectorized_words.csv',index = False, header=True)

   Rating                                             Review
0       1  ['sorry', 'everyone', 'know', 'supposed', 'art...
1       1  ['little', 'parent', 'took', 'along', 'theater...
2       1  ['film', 'mediocre', 'best', 'angie', 'harmon'...
3       1  ['film', 'one', 'giant', 'pant', 'load', 'paul...
4       1  ['movie', 'must', 'line', 'boring', 'movie', '...
     0   00  000  0000000000001  00001  00015  000s  001  003830  006  ...  \
0  0.0  0.0  0.0            0.0    0.0    0.0   0.0  0.0     0.0  0.0  ...   
1  0.0  0.0  0.0            0.0    0.0    0.0   0.0  0.0     0.0  0.0  ...   
2  0.0  0.0  0.0            0.0    0.0    0.0   0.0  0.0     0.0  0.0  ...   
3  0.0  0.0  0.0            0.0    0.0    0.0   0.0  0.0     0.0  0.0  ...   
4  0.0  0.0  0.0            0.0    0.0    0.0   0.0  0.0     0.0  0.0  ...   

    âº   â½   â¾  âžiâžek    ã  ã¼ber  ãœvegtigris  ãšxtase    ï  œat  
0  0.0  0.0  0.0      0.0  0.0    0.0          0.0      0.0  0.0  0.0  
1  0.0  0.0  0.0     

# Load the TFIDF object

In [18]:
%%time
import pickle
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances
import re
import numpy as np

class_data = pd.read_csv('./Cleaned_dataset/tokenized_words.csv',usecols = ['Rating'])

tfidf_object = pickle.load(open('./pickle_tfidf/tfidf.pickle','rb'))
vectorized_review_obj = pickle.load(open('./pickle_tfidf/vectorized_review.pickle','rb'))
new_q = '''stupid plot, I will give it really low ratings.'''
new_in = tfidf_object.transform([new_q])
print(new_in.todense())
distances = euclidean_distances(vectorized_review_obj,new_in)
print(distances)
list_min_distance = np.where(distances == np.amin(distances))[:]
for k in list_min_distance:
    print(class_data.loc[k,:])

# class_data = class_data.drop(class_data.index[list_max_distance])
# print(class_data['Rating'].value_counts())
# new_in = pd.DataFrame.sparse.from_spmatrix(new_in)
# print(new_in.head(6))

[[0. 0. 0. ... 0. 0. 0.]]
[[1.404389  ]
 [1.40673365]
 [1.41421356]
 ...
 [1.40625837]
 [1.37487303]
 [1.35379579]]
      Rating
5103      10
   Rating
0       1
Wall time: 421 ms
Parser   : 160 ms
