In [46]:
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from bs4 import BeautifulSoup
import unidecode as unidecode
import gensim.downloader as api
import regex as re
from textblob import TextBlob
from nltk.stem import WordNetLemmatizer

In [2]:
df = pd.read_csv('Train.csv')

In [3]:
df.head()

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos


In [4]:
data = df.values

In [5]:
print(data.shape)

(40000, 2)


In [6]:
print(data[1, 0])

http://video.google.com/videoplay?docid=211772166650071408&hl=en Distribution was tried.<br /><br />We opted for mass appeal.<br /><br />We want the best possible viewing range so, we forgo profit and continue our manual labor jobs gladly to entertain you for working yours.<br /><br />View Texas tale, please write about it... If you like it or not, if you like Alex or not, if you like Stuie, Texas or Texas tale... Just write about it.<br /><br />Your opinion rules.


In [7]:
def preprocess_text(text) :
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    text = url_pattern.sub(r'', text)
    text = unidecode.unidecode(text)
    soup = BeautifulSoup(text, 'html.parser')
    stripped_text = soup.get_text(separator = ' ')
    return stripped_text.lower()

In [50]:
xraw = list(data[:, 0].reshape((-1, )))
Y = data[:, 1]

In [51]:
print(Y.shape)

(40000,)


In [52]:
X = []
for text in xraw :
    X.append(text)
X = np.array(X)

In [53]:
X.shape, X[0]

((40000,),
 "mature intelligent and highly charged melodrama unbelivebly filmed in China in 1948. wei wei's stunning performance as the catylast in a love triangle is simply stunning if you have the oppurunity to see this magnificent film take it")

In [54]:
from nltk.corpus import stopwords

In [55]:
swords = set(stopwords.words('english'))

In [56]:
wn = WordNetLemmatizer()

In [57]:
print(swords)

{'theirs', 'whom', 'nor', 'down', 'does', 'of', 'yourself', 'haven', 'o', 'this', 'couldn', 'was', 'below', 'being', 'with', 'through', "haven't", 'themselves', 'here', 're', 'our', 'be', 'under', 'you', 'that', 'are', 'into', 'then', 'too', 'wouldn', "couldn't", "should've", 'did', 'shouldn', "needn't", 'only', 'when', 'hasn', 'most', "mustn't", 'in', 'd', 's', 'other', 'on', 'again', 'out', "aren't", 'few', 'further', 'same', 'more', 'a', 'very', 'yourselves', 'herself', 'over', 'how', 'it', 'their', 'own', 'should', 'll', 've', "shouldn't", "wasn't", 'to', 'he', 'between', 'yours', 'or', 'above', 'weren', 'because', "she's", 'ours', 'has', 'isn', "you'll", "hasn't", 'her', 'during', 'after', 'while', "weren't", 'himself', 'its', 'your', 'his', 'y', "wouldn't", 'is', 'just', 'all', "you'd", "that'll", 'ain', 'm', 'who', 'the', 'doesn', 'by', 'there', "don't", 'having', "didn't", 'not', "doesn't", "it's", 'about', 'didn', "you've", 'before', 'wasn', 'needn', 'ourselves', 'but', 'if', 

In [58]:
tokenizer = RegexpTokenizer("[a-zA-Z']+")

In [59]:
def remove_stopwords(text, stopwords) :
    useful_words = [w for w in text if w not in stopwords]
    return useful_words

In [60]:
def myTokenizer(document) :
    words = tokenizer.tokenize(document)
    words = remove_stopwords(words, swords)
    return words

In [61]:
from sklearn.feature_extraction.text import CountVectorizer

In [62]:
cv = CountVectorizer(lowercase = False)

In [63]:
x_vectorized = []
tokens = []
for i in X :
    words = myTokenizer(i)
    tokens += set(words)
    x_vectorized.append(np.array(words))
tokens = set(tokens)

In [64]:
print(x_vectorized[0])

['mature' 'intelligent' 'highly' 'charged' 'melodrama' 'unbelivebly'
 'filmed' 'China' 'wei' "wei's" 'stunning' 'performance' 'catylast' 'love'
 'triangle' 'simply' 'stunning' 'oppurunity' 'see' 'magnificent' 'film'
 'take']


In [65]:
n_total = Y.shape[0]

In [66]:
ypos = 0
yneg = 0

In [67]:
posmap = {}
negmap = {}
for i in tokens :
    posmap[i] = 0
    negmap[i] = 0

In [68]:
# posmap

In [69]:
for i in range(n_total) :
    if Y[i] == 'pos' :
        ypos += 1
        for x in x_vectorized[i] :
            posmap[x] += 1
    else :
        yneg += 1
        for x in x_vectorized[i] :
            negmap[x] += 1

In [70]:
ypos, yneg

(20011, 19989)

In [71]:
poswords = sum([1 for i in posmap.values() if i > 0])

In [72]:
negwords = sum([1 for i in negmap.values() if i > 0])

In [73]:
totalposwords = sum(posmap.values())
totalnegwords = sum(negmap.values())

In [74]:
poswords, negwords

(96813, 92185)

In [75]:
len(tokens)

134388

In [76]:
prior_prob_pos = ypos / n_total
prior_prob_neg = yneg / n_total

In [77]:
prior_prob_pos, prior_prob_neg

(0.500275, 0.499725)

In [78]:
vocab_size = len(tokens)

In [79]:
vocab_size

134388

In [80]:
def likelihood(word, cls) :
    if word not in tokens :
        return 1/vocab_size
    if cls == 'pos' :
        return (posmap[word] + 1) / (totalposwords + vocab_size)
    else :
        return (negmap[word] + 1) / (totalnegwords + vocab_size)

In [81]:
def predict(document) :
    text = preprocess_text(document)
    words = myTokenizer(text)
    text_vocab = {}
    probpos = 1
    probneg = 1
    for i in set(words) :
        probpos *= likelihood(i, 'pos')
        probneg *= likelihood(i, 'neg')
    probpos *= prior_prob_pos
    probneg *= prior_prob_neg
    if probpos >= probneg :
        return 'pos'
    else :
        return 'neg'

In [82]:
dftest = pd.read_csv('Test.csv')

In [83]:
dftest.head()

Unnamed: 0,review
0,Remember those old kung fu movies we used to w...
1,This movie is another one on my List of Movies...
2,How in the world does a thing like this get in...
3,"""Queen of the Damned"" is one of the best vampi..."
4,The Caprica episode (S01E01) is well done as a...


In [86]:
xtest = dftest.values.reshape((-1, ))
print(xtest.shape)
print(type(xtest[0]))

(10000,)
<class 'str'>


In [92]:
predictions = []
for i in range(xtest.shape[0]) :
    predictions.append(np.array([i, predict(xtest[i])]))

In [93]:
len(predictions)

10000

In [94]:
predictions[:10]

[array(['0', 'pos'], dtype='<U21'),
 array(['1', 'pos'], dtype='<U21'),
 array(['2', 'pos'], dtype='<U21'),
 array(['3', 'pos'], dtype='<U21'),
 array(['4', 'pos'], dtype='<U21'),
 array(['5', 'neg'], dtype='<U21'),
 array(['6', 'pos'], dtype='<U21'),
 array(['7', 'pos'], dtype='<U21'),
 array(['8', 'pos'], dtype='<U21'),
 array(['9', 'pos'], dtype='<U21')]

In [95]:
ytest = np.array(predictions)

In [100]:
ans = pd.DataFrame(ytest, columns = ['Id', 'label'])

In [102]:
ans.to_csv('ans.csv', index = False)