In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import nltk
import math
from collections import defaultdict,Counter

In [3]:
df = pd.read_csv('datasets/Train/Train.csv')
train = np.array(df)
print(train.shape)

(40000, 2)


In [4]:
print(train[:,0].shape)

(40000,)


In [4]:
allReviews = train[:,0]
allRatings = train[:,1]

In [5]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [6]:
tokenizer = RegexpTokenizer(r'(?<!\S)[A-Za-z]+(?!\S)|(?<!\S)[A-Za-z]+(?=:(?!\S))')
en_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

In [7]:
def getCleanedReview(review):
    review = review.lower()
    review = review.replace('<br /><br />',' ')
    
    #Tokenize
    tokens = tokenizer.tokenize(review)
    #Remove Stopwords
    new_tokens = [token for token in tokens if token not in en_stopwords]
    #Stemmization
    stemmed_tokens = [ps.stem(token) for token in new_tokens]
    
    cleaned_review = ' '.join(stemmed_tokens)
    
    return cleaned_review

    

In [19]:
print(getCleanedReview('efwe wef ewf e  fe fewwer frgreger  rgr egerg reger g reer ger ger'))

efw wef ewf e fe fewwer frgreger rgr egerg reger g reer ger ger


In [8]:
def cleaning(allReviews):
    count=1
    cleaned_reviews = []
    for review in allReviews:
        review = str(review)
        cleaned_reviews.append(getCleanedReview(review))
    return cleaned_reviews

In [9]:
cleaned_reviews = cleaning(allReviews)

In [10]:
cleaned_reviews = np.array(cleaned_reviews)

In [26]:
print(cleaned_reviews.shape[0])

40000


In [11]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [25]:
# cleaned_reviews.head()

AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [29]:
print(allRatings[:1000])

['pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'neg' 'neg' 'pos' 'pos' 'neg' 'pos'
 'pos' 'neg' 'pos' 'pos' 'pos' 'neg' 'pos' 'pos' 'neg' 'pos' 'pos' 'neg'
 'pos' 'pos' 'pos' 'neg' 'pos' 'pos' 'neg' 'pos' 'neg' 'pos' 'neg' 'neg'
 'neg' 'neg' 'neg' 'neg' 'pos' 'pos' 'neg' 'pos' 'neg' 'pos' 'neg' 'neg'
 'neg' 'pos' 'neg' 'neg' 'neg' 'pos' 'neg' 'pos' 'pos' 'neg' 'neg' 'pos'
 'neg' 'pos' 'neg' 'neg' 'neg' 'pos' 'pos' 'neg' 'pos' 'pos' 'pos' 'neg'
 'neg' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'neg' 'neg' 'neg' 'neg'
 'neg' 'pos' 'neg' 'neg' 'pos' 'pos' 'pos' 'pos' 'neg' 'pos' 'neg' 'pos'
 'pos' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'pos' 'pos'
 'pos' 'neg' 'pos' 'pos' 'neg' 'neg' 'neg' 'neg' 'pos' 'pos' 'neg' 'pos'
 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'neg' 'neg' 'pos' 'neg' 'pos' 'neg'
 'neg' 'pos' 'neg' 'pos' 'pos' 'pos' 'neg' 'neg' 'pos' 'neg' 'neg' 'pos'
 'neg' 'pos' 'neg' 'pos' 'pos' 'neg' 'pos' 'neg' 'neg' 'neg' 'pos' 'pos'
 'neg' 'neg' 'neg' 'neg' 'neg' 'pos' 'pos' 'neg' 'n

In [12]:
def fit(cleaned_reviews, allRatings):
    n_class_items = {}
    log_class_priors = {}
    word_counts = {}
    vocab = set()
    
    n = cleaned_reviews.shape[0]
    # For positive Statements
    pos_data = cleaned_reviews[np.where(allRatings=='pos')]
    # For negative Statements
    neg_data = cleaned_reviews[np.where(allRatings=='neg')]
    
    n_class_items['pos'] = pos_data.shape[0]
    n_class_items['neg'] = neg_data.shape[0]
    
    log_class_priors['pos'] = math.log(n_class_items['pos']/n)
    log_class_priors['neg'] = math.log(n_class_items['neg']/n)
    
    word_counts['pos'] = defaultdict(lambda: 0)
    word_counts['neg'] = defaultdict(lambda: 0)
    
    for text in pos_data:
        text = text.split()
        counts = Counter(text)
        for word, count in counts.items():
            vocab.add(word)
            word_counts['pos'][word] += count
    
    for text in neg_data:
        text = text.split()
        counts = Counter(text)
        for word, count in counts.items():
            vocab.add(word)
            word_counts['neg'][word] += count
    
    return n_class_items, log_class_priors, word_counts, vocab

            

In [13]:
def predict(test, n_clas_items, log_class_priors, word_counts, vocab):
    result = []
    for text in test:
        class_scores = {c: log_class_priors[c] for c in ['pos','neg']}
        words = (getCleanedReview(text).split())
        
        for word in words:
            # pos
            num = word_counts['pos'][word]+1
            denom = n_class_items['pos']+ len(vocab)
            log_w_pos = math.log(num/denom)
            class_scores['pos'] += log_w_pos
            
            #neg
            num = word_counts['neg'][word]+1
            denom = n_class_items['neg']+ len(vocab)
            log_w_pos = math.log(num/denom)
            class_scores['neg'] += log_w_pos
        
        
        result.append(max(class_scores,key = class_scores.get))
    return result

In [14]:
df = pd.read_csv('datasets/Test/Test.csv')
test = np.array(df)
print(test.shape)

(10000, 1)


In [15]:
n_class_items, log_class_priors, word_counts, vocab = fit(cleaned_reviews, allRatings)

In [25]:
test = test[:,0]
print(test.shape)

print[test[0].shape]

IndexError: too many indices for array

In [27]:
result = predict(test,n_class_items, log_class_priors, word_counts, vocab)

In [28]:
df = pd.DataFrame(result)

In [29]:
df.to_csv('datasets/Test/output.csv')