# Sentiment Analyzer

In [41]:
import pandas as pd

In [42]:
filepath_dict = {'yelp':   'data/yelp_labelled.txt',
                 'amazon': 'data/amazon_cells_labelled.txt',
                 'imdb':   'data/imdb_labelled.txt'}

df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source  # Add another column filled with the source name
    df_list.append(df)

df = pd.concat(df_list)
print(df.iloc[0])

sentence    Wow... Loved this place.
label                              1
source                          yelp
Name: 0, dtype: object


In [43]:
sentences = ['John likes ice cream', 'John hates chocolate.']

In [44]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(sentences)
vectorizer.vocabulary_

{'John': 0, 'likes': 5, 'ice': 4, 'cream': 2, 'hates': 3, 'chocolate': 1}

In [45]:
transformed = vectorizer.transform(sentences).toarray()

In [46]:
transformed


array([[1, 0, 1, 0, 1, 1],
       [1, 1, 0, 1, 0, 0]])

In [47]:
from sklearn.model_selection import train_test_split

df_yelp = df[df['source'] == 'yelp']

sentences = df_yelp['sentence'].values
y = df_yelp['label'].values

sentences_train, sentences_test, y_train, y_test = train_test_split(
   sentences, y, test_size=0.25, random_state=1000)

In [48]:
len(sentences_train)

750

In [49]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)

X_train = vectorizer.transform(sentences_train)
X_test  = vectorizer.transform(sentences_test)
X_train

<750x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 7368 stored elements in Compressed Sparse Row format>

In [50]:
X_test

<250x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 2069 stored elements in Compressed Sparse Row format>

1714 words available
each sentence uses some of them
sentence = [0,0,0,0,1,0,0,0,1,0,0......]

In [51]:
from sklearn.linear_model import LogisticRegression

# S shape sigmoid   
#.       ______
#.      |
#  _____

classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)

print("Accuracy:", score)

Accuracy: 0.796


In [52]:
df

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp
...,...,...,...
743,I just got bored watching Jessice Lange take h...,0,imdb
744,"Unfortunately, any virtue in this film's produ...",0,imdb
745,"In a word, it is embarrassing.",0,imdb
746,Exceptionally bad!,0,imdb


In [53]:
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_train)
    X_train = vectorizer.transform(sentences_train)
    X_test  = vectorizer.transform(sentences_test)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7487


In [54]:
sentences = ['I love playing FIFA','I love chocolate','the weather is cold']
new_sentences = vectorizer.transform(['I love playing FIFA','I love chocolate','the weather is cold'])
predict=classifier.predict(new_sentences)
predict

array([1, 1, 0])

In [55]:
vectorizer.transform(sentences).toarray()
predict = classifier.predict(new_sentences)
for i in range(len(predict)):
    if predict[i] == 1:
        print(f'{sentences[i]} is a good feedback')
    else:
        print(f'{sentences[i]} is a bad feedback')

predict        


I love playing FIFA is a good feedback
I love chocolate is a good feedback
the weather is cold is a bad feedback


array([1, 1, 0])