In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
%config IPCompleter.greedy=True

### Dataset Information
The dataset consists of **train.tsv** and **test.tsv** in the folder **movies_review**. It is used to predict the sentiment of a customer's feedback on a film. The sentiment labels are:
* 0 - negative
* 1 - somewhat negative
* 2 - neutral
* 3 - somewhat positive
* 4 - positive
### Requirement
1. Read dataset and do basic analysis
2. Create word clouds for the Positive and Negative reviews. Print the list of the 20 words that have the highest weight and visualize them
3. Filter the data to keep the phrases that have over 50 characters and divide them into 2 datasets: source and target
4. Choose a method to standardize the data and visualize the result using the dataset from #3

In [None]:
# Load the tab-separated training split into a DataFrame.
dataset = pd.read_csv(
    "data/movie_review/train.tsv",
    delimiter="\t",
)

In [None]:
# Preview the first five rows of the raw data.
dataset.head(n=5)

In [None]:
# Keep only the phrase text and its sentiment label. `.copy()` makes `df_text`
# an independent frame, so assigning to its columns later neither triggers
# pandas' SettingWithCopyWarning nor depends on whether the selection is a view
# of `dataset`.
df_text = dataset[['Phrase', 'Sentiment']].copy()

In [None]:
# Lower-case every phrase. `assign` returns a new frame instead of writing into
# a column selection of `dataset`, which avoids pandas' SettingWithCopyWarning
# and keeps this cell safe to re-run.
df_text = df_text.assign(Phrase=df_text["Phrase"].str.lower())

In [None]:
# Preview the first five rows after lower-casing.
df_text.head(n=5)

In [None]:
# Print the column dtypes, non-null counts and memory usage of `df_text`.
df_text.info()

In [None]:
# Count the non-null values of each column per sentiment label, then show the
# per-label phrase counts.
df_by_sentiment = df_text.groupby("Sentiment").count()
df_by_sentiment["Phrase"]

In [None]:
# Bar chart of the number of phrases per sentiment label. `x` and `y` are passed
# by keyword because seaborn 0.12+ no longer accepts them positionally.
sb.barplot(x=df_by_sentiment.index, y=df_by_sentiment.Phrase)

In [None]:
# Select the two extreme classes: label 4 is "positive", label 0 is "negative".
positive_text = df_text.loc[df_text["Sentiment"].eq(4)]
negative_text = df_text.loc[df_text["Sentiment"].eq(0)]

In [None]:
# Print the column dtypes, non-null counts and memory usage of the positive subset.
positive_text.info()

In [None]:
# Print the column dtypes, non-null counts and memory usage of the negative subset.
negative_text.info()

In [None]:
# Preview the first five positive phrases.
positive_text.head(n=5)

In [None]:
# Concatenate all phrases of each class into one space-separated string for the
# word cloud. Joining the raw values (rather than going through
# `Series.to_string`, which is a display formatter) avoids alignment padding and
# the truncation of long phrases governed by pandas' display options.
# `dropna().astype(str)` guards the join against missing or non-string entries.
positive_str = " ".join(positive_text["Phrase"].dropna().astype(str))
negative_str = " ".join(negative_text["Phrase"].dropna().astype(str))

In [None]:
from wordcloud import WordCloud, STOPWORDS 
import matplotlib.pyplot as plt 

In [None]:
# Configure the word cloud for the positive reviews: 800x800 canvas on a white
# background, wordcloud's bundled stop words removed, fonts no smaller than 10.
positive_wc = WordCloud(
    width=800,
    height=800,
    background_color="white",
    stopwords=set(STOPWORDS),
    min_font_size=10,
)

In [None]:
# Build the cloud from the positive text and render it on a square figure with
# the axes hidden and no padding around the image.
positive_wc.generate(positive_str)
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(positive_wc)
ax.axis("off")
fig.tight_layout(pad=0)

plt.show()

In [None]:
# `words_` holds the word -> weight mapping produced by `generate`.
positive_ser = pd.Series(positive_wc.words_)
# `nlargest(20)` returns exactly the 20 highest-weighted words; an end-exclusive
# slice such as `[0:19]` would stop after 19 and relies on the mapping already
# being ordered by weight.
top_20_positive_word = positive_ser.nlargest(20)
top_20_positive_word.index

In [None]:
# Horizontal bar chart of the top positive words against their weights.
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(top_20_positive_word.index, top_20_positive_word.values)

In [None]:
# Word cloud for the negative reviews, configured with the same settings as the
# positive one: 800x800 canvas, white background, bundled stop words removed.
negative_wc = WordCloud(
    width=800,
    height=800,
    background_color="white",
    stopwords=set(STOPWORDS),
    min_font_size=10,
)
negative_wc.generate(negative_str)

# Render the cloud on a square figure with the axes hidden and no padding.
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(negative_wc)
ax.axis("off")
fig.tight_layout(pad=0)

plt.show()

In [None]:
# `words_` holds the word -> weight mapping produced by `generate`.
negative_ser = pd.Series(negative_wc.words_)
# `nlargest(20)` returns exactly the 20 highest-weighted words; an end-exclusive
# slice such as `[0:19]` would stop after 19 and relies on the mapping already
# being ordered by weight.
top_20_negative_word = negative_ser.nlargest(20)
top_20_negative_word

In [None]:
# Horizontal bar chart of the top negative words against their weights.
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(top_20_negative_word.index, top_20_negative_word.values)

In [None]:
# Re-display the first five rows of the full text frame before filtering by length.
df_text.head(n=5)

In [None]:
# Keep only the rows whose phrase is longer than 50 characters.
df_text_len_over_50 = df_text.loc[df_text["Phrase"].str.len().gt(50)]

In [None]:
# Preview the first five long phrases.
df_text_len_over_50.head(n=5)

In [None]:
# Split the filtered frame into source (phrase text) and target (sentiment
# label); the list selector keeps each one a single-column DataFrame.
source = df_text_len_over_50.loc[:, ["Phrase"]]
target = df_text_len_over_50.loc[:, ["Sentiment"]]

In [None]:
### Standardize using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Token-count vectorizer that drops scikit-learn's built-in English stop words;
# fitting learns the vocabulary from the source phrases.
cv = CountVectorizer(stop_words="english")
cv.fit(source["Phrase"])

In [None]:
# Show the learned vocabulary. `get_feature_names()` was removed in
# scikit-learn 1.2, so prefer `get_feature_names_out()` and fall back to the old
# name only on releases that predate it.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
feature_names

In [None]:
# Encode every source phrase as a row of token counts over the fitted
# vocabulary; the result is a sparse document-term matrix.
pharse_tranform = cv.transform(source["Phrase"])

In [None]:
# Report the matrix shape and show a dense preview of the first rows only.
# Calling `.toarray()` on the whole sparse matrix allocates
# (n_phrases x vocabulary_size) cells just to display a truncated repr.
print(pharse_tranform.shape)
pharse_tranform[:5].toarray()

In [None]:
# Column labels for the document-term matrix. `get_feature_names()` was removed
# in scikit-learn 1.2, so prefer `get_feature_names_out()` and fall back to the
# old name only on releases that predate it.
pharse_feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
# Build a sparse-backed DataFrame directly from the SciPy matrix instead of
# densifying it first, so memory stays proportional to the non-zero counts.
# Use `pharse_matrix.sparse.to_dense()` if a fully dense frame is required.
pharse_matrix = pd.DataFrame.sparse.from_spmatrix(
    pharse_tranform, columns=pharse_feature_names
)

In [None]:
# Preview the first five rows of the document-term frame.
pharse_matrix.head(n=5)