In [1]:
from numpy import array, zeros, argmin, inf, equal, ndim
from scipy.spatial.distance import cdist
import pandas as pd


def dtw(x, y, dist):
    """
    Computes Dynamic Time Warping (DTW) of two sequences.
    :param array x: N1*M array
    :param array y: N2*M array
    :param func dist: distance used as cost measure
    Returns the minimum distance, the cost matrix, the accumulated cost matrix, and the wrap path.
    """
    assert len(x)
    assert len(y)
    r, c = len(x), len(y)
    D0 = zeros((r + 1, c + 1))
    D0[0, 1:] = inf
    D0[1:, 0] = inf
    D1 = D0[1:, 1:]  # view
    for i in range(r):
        for j in range(c):
            D1[i, j] = dist(x[i], y[j])
    C = D1.copy()
    for i in range(r):
        for j in range(c):
            D1[i, j] += min(D0[i, j], D0[i, j + 1], D0[i + 1, j])
    if len(x) == 1:
        path = zeros(len(y)), range(len(y))
    elif len(y) == 1:
        path = range(len(x)), zeros(len(x))
    else:
        path = _traceback(D0)
    return D1[-1, -1] / sum(D1.shape), C, D1, path


def _traceback(D):
    i, j = array(D.shape) - 2
    p, q = [i], [j]
    while ((i > 0) or (j > 0)):
        tb = argmin((D[i, j], D[i, j + 1], D[i + 1, j]))
        if (tb == 0):
            i -= 1
            j -= 1
        elif (tb == 1):
            i -= 1
        else:  # (tb == 2):
            j -= 1
        p.insert(0, i)
        q.insert(0, j)
    return array(p), array(q)


def dist_for_float(p1, p2):
    dist = 0.0
    elem_type = type(p1)
    if  elem_type == float or elem_type == int :
        dist = float(abs(p1 - p2))
    else :
        sumval = 0.0
        for i in range(len(p1)) :
            sumval += pow(p1[i] - p2[i], 2)
        dist = pow(sumval, 0.5)
    return dist

In [2]:
file_path = './Problem_C_Data/hair_dryer.tsv'
file_path2 = './Problem_C_Data/microwave.tsv'
file_path3 = './Problem_C_Data/pacifier.tsv'

reviews = pd.read_csv(file_path, sep='\t', header=0)
reviews2 = pd.read_csv(file_path2, sep='\t', header=0)
reviews3 = pd.read_csv(file_path3, sep='\t', header=0)

In [3]:
title = reviews['product_title'].value_counts().index[0]

In [6]:
kindle = reviews[reviews.product_title==title]
comments = pd.concat([kindle['review_body']+". "+ kindle['review_headline'],kindle['star_rating'],kindle['helpful_votes']/kindle['total_votes'],kindle['vine']],axis=1)
comments.columns=['text','rating','recommend','vine']
comments.head()

Unnamed: 0,text,rating,recommend,vine
0,Works great!. Works great,5,,N
4,I just got this last week. I think's great. Th...,4,,N
23,After using the same blowdryer for probably 8-...,5,,N
50,The ends are pink. Not what I expected. Three ...,3,,N
79,I was hoping for a quieter blow dryer. This is...,3,,N


In [7]:
import string
import nltk
from nltk import PorterStemmer
import re 

stopwords = nltk.corpus.stopwords.words('english')
ps = PorterStemmer()
wn = nltk.WordNetLemmatizer()


def clean_stem (sent): 
    temp1 ="".join(x for x in sent if x not in string.punctuation)
    temp2 = re.split('\W+',temp1.lower())
    temp3 = [ps.stem(x) for x in temp2 if x not in stopwords]
    return temp3

def clean_lemma (sent): 
    temp1 ="".join(x for x in sent if x not in string.punctuation)
    temp2 = re.split('\W+',temp1.lower())
    temp3 = [wn.lemmatize(x) for x in temp2 if x not in stopwords]
    return temp3

text="Hello this is, my happiest place. organize, organizes, and organizing in Happy world ! with happiness ..\
so much of happy!! "

print("Stemmed " + "-".join(clean_stem(text)))
print("Lemmatized " + "-".join(clean_lemma(text)))

Stemmed hello-happiest-place-organ-organ-organ-happi-world-happi-much-happi-
Lemmatized hello-happiest-place-organize-organizes-organizing-happy-world-happiness-much-happy-


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectstem = TfidfVectorizer(analyzer=clean_stem)
vectlemm = TfidfVectorizer(analyzer=clean_lemma)

textfeatures=vectstem.fit_transform(comments['text'])
print("Stemmed - " + str(len(vectstem.get_feature_names())))

vectlemm.fit_transform(comments['text'])
print("Lemmatized - " + str(len(vectlemm.get_feature_names())))

Stemmed - 2297
Lemmatized - 2756


In [11]:
textmatrix = pd.DataFrame(textfeatures.toarray(),columns=vectstem.vocabulary_)
sum_scores = pd.DataFrame(textmatrix.sum(),columns=['sum_scores_TFIDF'])
sum_scores.sort_values(by='sum_scores_TFIDF',ascending=True)[:5] 

Unnamed: 0,sum_scores_TFIDF
screech,0.058194
smart,0.058194
occasion,0.058194
happend,0.058194
muchi,0.058194


In [12]:
sum_scores.sort_values(by='sum_scores_TFIDF',ascending=False)[:5]

Unnamed: 0,sum_scores_TFIDF
femal,50.284323
cheaper,46.950293
spray,33.324485
34overcook34,31.898097
except,27.724263


In [13]:
pd.set_option('display.max_colwidth', 0)
comments.head()

Unnamed: 0,text,rating,recommend,vine
0,Works great!. Works great,5,,N
4,I just got this last week. I think's great. The cord length is perfect.. I think's great. The cord length is perfect,4,,N
23,"After using the same blowdryer for probably 8-10years, I decided to upgrade to a new one. I have very thick (*VERY THICK!*) moderately wavy shoulder length hair. Due to my hair thickness, drying times were typically 30-45minutes. I wish I was joking. This dryer dries completely in under 15 minutes (about 10-12minutes more precisely). I am so thrilled with this dryer!! Can't say enough good things about it! I like the texture of the dryer itself, love the speed of drying and the cord is long enough to actually be able to use it! SCORE!. I am obsessed",5,,N
50,The ends are pink. Not what I expected. Three Stars,3,,N
79,"I was hoping for a quieter blow dryer. This is definitely quieter than my other dryer, but still not quiet. I also read all kinds of reviews that said this dries your hair so it's smooth and silky. The only thing that makes my hair smooth and silky is product!. Not as Quiet as the Reviews said",3,,N


In [15]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

happy = "I am Happy. this is so awesome. I love life. I will be in heaven"
#when you find free food in university
print("happy " + str(sid.polarity_scores(text)))


sad = "i hate this. I am mad this is stupid. I will kill you"
#when your professor gives you a ZERO in assignment
print("sad " + str(sid.polarity_scores(sad)))

neut = "I will come. You should go. This is blue color"
#when you state facts and nothing else
print("dont care - " + str(sid.polarity_scores(neut)))

srishti = "money"
print("dss - " + str(sid.polarity_scores(srishti)))

happy {'neg': 0.0, 'neu': 0.483, 'pos': 0.517, 'compound': 0.9522}
sad {'neg': 0.714, 'neu': 0.286, 'pos': 0.0, 'compound': -0.9432}
dont care - {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
dss - {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


In [16]:
# Feature 1 : Sentiment compound value
def sentiment(x):
    score = sid.polarity_scores(x)
    return score['compound']
    
#sentiment(happy)
comments['sentiment']= comments['text'].apply(lambda x : sentiment(x))

# Feature 2 : Length of string

comments['length'] = comments['text'].apply(lambda x : len(re.split('\W+',x)))
comments[comments['rating']==5].head(10)

# before we proceed - we need to convert all true >> 1 and false as 0
def convert(x):
    
    if x==True:
        return 1
    else :
        return 0
    
print(convert("False"))

comments['target_rec'] = comments['recommend'].apply(lambda x : convert(x))
comments.head(5)

comments[comments['rating']==1].head(5)


0


Unnamed: 0,text,rating,recommend,vine,sentiment,length,target_rec
80,"Have had it for about 4 years and thought it did a good enough job. I use it on medium heat/blow settings and always clean the air intake screen at the first sign of lint. However, it has recently been throwing out sparks and some actual flames. At first it was a high pitched screeching noise, then popping, then sparks, now FLAMES! It is in the trash and I will be looking for a better dryer. Funny that my mother has had the same dryer for over 15 years, but this modern &#34;high tech&#34; stuff is just mass produced junk!. Dangerous, shoots SPARKS and FLAMES!",1,0.5,N,0.3455,109,0
100,Over heated in 2 minute! We had to return it and had to pay bubble rap to sent it back to Amazon. Good luck if you want to order this through online.. Overheated in 2 minutes.,1,,N,0.7777,37,0
282,Please don't buy this. I used this for just 3 or 4 times and now started getting coil burning smell when I turn it on. It should go to trash now.. Please don't buy this item,1,1.0,N,0.5574,38,1
338,Problems using with uk adaptor. One Star,1,,N,-0.4019,7,0
368,This item is pink and it is not clearly advertised as such.. One Star,1,0.0,N,-0.3089,14,0


In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

# need to reset index of the comments column to match with textfeatures
new_sentiment = comments.sentiment.reset_index()['sentiment']
new_length = comments.length.reset_index()['length']

x_features = pd.concat([new_sentiment,new_length,
                        pd.DataFrame(textfeatures.toarray(),
                        columns=vectstem.vocabulary_)],axis=1)
x_train, x_test, y_train, y_test = train_test_split(x_features,comments.target_rec,test_size=0.2)

rf = RandomForestClassifier(n_jobs=-1,n_estimators=50,max_depth=90)
rfmodel=rf.fit(x_train,y_train)

y_pred = rfmodel.predict(x_test)
sorted(zip(rfmodel.feature_importances_,x_train.columns),reverse=True)[0:10]

[(0.013444908861393967, 'proair'),
 (0.009666167408930611, 'wtih'),
 (0.009139098324525936, 'drybr'),
 (0.009024204624359228, '1'),
 (0.008940957318066194, 'length'),
 (0.008765456668053591, 'ereal'),
 (0.008748107470881582, 'leakag'),
 (0.008025350057453296, 'placement'),
 (0.007640325527989937, 'cheaper'),
 (0.007262370959015938, 'sadli')]

In [18]:
precision, recall, fscore , support = score(y_test,y_pred,average='binary')
print('Precision: {} / Recall :{} / Accuracy {} '.format(round(precision,3),
                                                         round(recall,3),
                                                         round((y_pred==y_test).sum()/len(y_test),3)))

Precision: 0.0 / Recall :0.0 / Accuracy 0.898 


  'precision', 'predicted', average, warn_for)
