#### Downloading data in a zip file and unzip it

In [1]:
import urllib.request
import io
import gzip
url = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Cell_Phones_and_Accessories_5.json.gz'
response = urllib.request.urlopen(url)
compressed_file = io.BytesIO(response.read())
decompressed_file = gzip.GzipFile(fileobj=compressed_file)

with open("data.json", 'wb') as outfile:
    outfile.write(decompressed_file.read())

#### Making a list of words from the json we unzipped above

In [1]:
import json
with open("data.json", 'r') as f:
  x = f.read()
  reviewsList = x.split("\n")
  listOfDicts = []
  for review in reviewsList:
    try:
      p = json.loads(review)
      listOfDicts.append(p)
    except:
      print(review)
listOfDicts[:1]




[{'asin': '120401325X',
  'helpful': [0, 0],
  'overall': 4.0,
  'reviewText': "They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again",
  'reviewTime': '05 21, 2014',
  'reviewerID': 'A30TL5EWN6DFXT',
  'reviewerName': 'christina',
  'summary': 'Looks Good',
  'unixReviewTime': 1400630400}]

In [None]:
# test = listOfDicts[90000:]
# listOfDicts = listOfDicts[:70000]
# print(len(listOfDicts))

#### Import nltk to get stopwords which we are going to use later to remove in the text we downloaded above

In [2]:
import nltk
#nltk.download()
from nltk.corpus import stopwords
stop_words = list(stopwords.words('english'))

#### From listofDict we made above we remove stop words and prefix, suffix using PorterStemmer and WordNetLemmatizer

In [3]:
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
trainingData = []
porter = PorterStemmer()
wnl = WordNetLemmatizer()
for review in listOfDicts:
    #text = review["reviewText"].split(" ")
    text = nltk.word_tokenize(review["reviewText"])
    rwords=[]
    for t in text:
        if t not in stop_words:
            t = porter.stem(t)
            t = wnl.lemmatize(t)
            rwords.append(t)
    rating = review["overall"]
    trainingData.append((rwords,rating))

 
trainingData[:2]



[(['they',
   'look',
   'good',
   'stick',
   'good',
   '!',
   'I',
   "n't",
   'like',
   'round',
   'shape',
   'I',
   'alway',
   'bump',
   'siri',
   'kept',
   'pop',
   'irrit',
   '.',
   'I',
   'wo',
   "n't",
   'buy',
   'product',
   'like'],
  4.0),
 (['these',
   'sticker',
   'work',
   'like',
   'review',
   'say',
   '.',
   'they',
   'stick',
   'great',
   'stay',
   'phone',
   '.',
   'they',
   'super',
   'stylish',
   'I',
   'share',
   'sister',
   '.',
   ':',
   ')'],
  5.0)]

#### Here we are trying to shuffle data so that we randomly pick data while training

In [4]:
import random
random.shuffle(trainingData)
trainingData[:2]

[(['the',
   'belkin',
   'charger',
   'nice',
   'construct',
   'easi',
   'hold',
   'insert',
   'remov',
   'socket',
   '.',
   'I',
   'like',
   'taper',
   'bodi',
   'soft',
   'rubber',
   'green',
   'strip',
   'aid',
   'grip',
   '.',
   'the',
   'charger',
   'work',
   'fine',
   'ipad',
   '-',
   'I',
   "'m",
   'equip',
   'measur',
   'actual',
   'power',
   'output',
   ',',
   'I',
   'reason',
   'believ',
   "n't",
   'meet',
   'spec.th',
   'cabl',
   'fairli',
   'typical-look',
   'lightn',
   'cabl',
   '.',
   'I',
   'like',
   'lightn',
   'end',
   'almost',
   'size',
   'appl',
   "'s",
   ',',
   'wo',
   "n't",
   'issu',
   'case',
   'cabl',
   ',',
   'amazon',
   'basic',
   ',',
   '.',
   'I',
   'also',
   'like',
   "'s",
   'four',
   'foot',
   'instead',
   'usual',
   'three.i',
   'knock',
   'half',
   'star',
   'led',
   'indic',
   'power',
   '.',
   'nearli',
   'third-parti',
   'charger',
   'I',
   'use',
   'led',
   '.',

#### Now lets try to get word features to the word list we made above

In [5]:
# Get all the words from all the tweets
def get_words_in_tweets(tweets):
    all_words = []
    all_words2 = []
    for (words, sentiment) in tweets:
        all_words.extend(words)
    for word in all_words:
      if len(word)>2:
        all_words2.append(word)
    return all_words2
  

# Extract the most frequent words
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = [w for (w, c) in wordlist.most_common(2000)]
    return word_features

word_features = get_word_features(get_words_in_tweets(trainingData))
word_features[:5]

['phone', 'case', 'use', "n't", 'the']

#### lets check length of word list and word features

In [6]:
print(len(trainingData))
print(len(word_features))

194439
2000


#### Now we define a function, which we are going to use later, to extract features for a document

In [7]:
def extract_features(document): # I renamed it; that's all
    document_words = set(document)
    features = {}
    for word in word_features:
        features[word] = (word in document_words)
    return features

#### Now let's select training and testing set after applying extract features function we defined above

In [8]:
training_set = nltk.classify.apply_features(extract_features, trainingData)
new_training_set = training_set[:70000]
new_testing_set = training_set[80000:90000]
print(len(new_training_set))

70000


#### Lets apply LinearSVC to training set we selected above

In [9]:
import nltk.classify
from sklearn.svm import LinearSVC

classifier = nltk.classify.SklearnClassifier(LinearSVC())
classifier.train(new_training_set)




<SklearnClassifier(LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))>

#### Now we shall check the accuracy of the classifier we ran above

In [11]:
test1 = new_testing_set[0:10000]
count = 0
for t in test1:
    predict = classifier.classify(t[0])
    if predict == t[1]:
        count = count + 1
print(count/10000)
    

0.6232


#### Here we shall save the LinearSVC classifier model we trained above

In [15]:
import pickle
pickle.dump(classifier,open('Models/SVMClassifier','wb'))

#### Now lets load and test LinearSVC classifier model we saved in above step

In [16]:
loadedClassifier = pickle.load(open('Models/SVMClassifier','rb'))
t = "Samsung was much better than Iphone"
predict = classifier.classify(extract_features(t.split()))
print(predict)

5.0


#### Simply we will see the content of extract feature for simple text

In [37]:
t = "Iphone is working"
temp = extract_features(t.split())
print(temp)

{'phone': False, 'case': False, 'charg': False, 'like': False, 'work': False, 'great': False, 'batteri': False, 'screen': False, 'would': False, 'good': False, 'look': False, 'iphon': False, 'well': False, 'time': False, 'charger': False, 'protect': False, 'product': False, 'devic': False, 'realli': False, 'love': False, 'also': False, 'protector': False, 'need': False, 'make': False, 'nice': False, 'back': False, 'much': False, 'price': False, 'littl': False, 'qualiti': False, 'cover': False, 'even': False, 'cabl': False, 'easi': False, 'power': False, 'button': False, 'feel': False, 'want': False, 'recommend': False, 'still': False, 'come': False, 'thing': False, 'take': False, 'bought': False, 'better': False, 'review': False, 'sound': False, 'could': False, 'purchas': False, 'first': False, 'plug': False, 'color': False, 'problem': False, 'keep': False, 'port': False, 'seem': False, 'without': False, 'headset': False, 'hold': False, 'drop': False, 'around': False, 'galaxi': False, 

#### Now we created list of feature and labels to use in different algorithms

In [46]:
print(type(listOfDicts))

featuresList = []
labelsList = []
for review in listOfDicts:
    featuresList.append(review["reviewText"])
    labelsList.append(review["overall"])

<class 'list'>


#### Here we fit the features list to use later

#### Lets see the content of features list we created above

In [43]:
print(featuresList)

0         I feel so LUCKY to have found this used (phone...
1         nice phone, nice up grade from my pantach revu...
2                                              Very pleased
3         It works good but it goes slow sometimes but i...
4         Great phone to replace my lost phone. The only...
5         I already had a phone with problems... I know ...
6         The charging port was loose. I got that solder...
7         Phone looks good but wouldn't stay charged, ha...
8         I originally was using the Samsung S2 Galaxy f...
9         It's battery life is great. It's very responsi...
10        My fiance had this phone previously, but cause...
11        This is a great product it came after two days...
12        These guys are the best! I had a little situat...
13        I'm really disappointed about my phone and ser...
14        Ordered this phone as a replacement for the sa...
15        Had this phone before and loved it but was not...
16        I was able to get the phone I 

#### Import all classifiers/algorithms we are going to use later

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.model_selection import cross_val_score
import pandas as pd

#### Select few of the features and labels for training

In [21]:
featuresTemp = features[:70000]
labelsTemp = labels[:70000]
print(featuresTemp.shape)
print(len(labelsTemp))

(70000, 247011)
70000


#### Here we will run the algorithms on the training data we selected above. Also we are running all alogrithms in one go.

In [22]:
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, featuresTemp, labelsTemp, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))

cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

#### Lets check the accuracy of the algorithms we ran above

In [23]:
cv_df.groupby('model_name').accuracy.mean()

model_name
LinearSVC                 0.606400
LogisticRegression        0.609100
MultinomialNB             0.527229
RandomForestClassifier    0.526671
Name: accuracy, dtype: float64

#### Now we will save each of the models we got above

In [24]:

for model in models:
    model_name = model.__class__.__name__
    pickle.dump(classifier,open('Models/' + model_name,'wb'))

#### Here we are trying to train using neural network

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(featuresTemp, labelsTemp)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(with_mean=False)
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(30,30,30))
mlp.fit(X_train,y_train)


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(30, 30, 30), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

#### Saving the trained neural network for later use

In [26]:
pickle.dump(mlp,open('Models/NeuralNetwork','wb'))

#### Now we are trying to load the saved model and test accuracy of it.

In [28]:
mlp2 = pickle.load(open('Models/NeuralNetwork','rb'))
predictions = mlp2.predict(X_test)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,predictions))

0.4946285714285714


#### Loading new data set

In [127]:
import pandas as pd
data = pd.read_csv('Amazon_Unlocked_Mobile.csv')


#### Shuffle the loaded data to randomly select data while training

In [128]:
from sklearn.utils import shuffle
data = shuffle(data)

#### Checking the number of records for each rating

In [129]:
data_2 = data[:70000]
data_3 = data_2[['Rating','Reviews']]
data_3_1 = data_3[data_3.Rating == 1]
data_3_2 = data_3[data_3.Rating == 2]
data_3_3 = data_3[data_3.Rating == 3]
data_3_4 = data_3[data_3.Rating == 4]
data_3_5 = data_3[data_3.Rating == 5]
print(len(data_3_1))
print(len(data_3_2))
print(len(data_3_3))
print(len(data_3_4))
print(len(data_3_5))

12026
4193
5500
10432
37849


#### Select data columns in the data_2 dataframe

In [130]:
rat_data = data_2['Rating']
rev_data = data_2['Reviews']

#### Fitting the data to extract features to be used later

In [131]:
tfidf = TfidfVectorizer(sublinear_tf=True, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

features = tfidf.fit_transform(rev_data.values.astype('U'))
labels = rat_data


#### Running the Linear SVC on new data set

In [77]:
models = [
    
    LinearSVC(),
    
]
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, featuresTemp, labelsTemp, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))

cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

#### Checking the accuracy of the classifier

In [78]:
cv_df.groupby('model_name').accuracy.mean()

model_name
LinearSVC    0.6064
Name: accuracy, dtype: float64

#### Here we are trying to fit after train and test split

In [132]:
model = LinearSVC()
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, data_2.index, test_size=0.33, random_state=0)
model.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

#### Predicting the output using classifier we created above

In [133]:
list_1 = []
list_1.append('iphone is worst ')
inp = tfidf.transform(list_1)
out2 = model.predict(inp)
print(out2)

[1]


#### Saving the model and TfidfVectorizer to use for prediction later

In [134]:
pickle.dump(tfidf,open('Models/tfidf','wb'))
pickle.dump(model,open('Models/LinearSVC','wb'))