### Import Libraries

In [1]:
# standard library
import csv
import string

# third-party modules
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

### Read Data

In [2]:
def _read_labelled(path):
    """Load one tab-separated, header-less file into a DataFrame.

    The two columns are named 'comment' and 'sentiment'.

    quoting=csv.QUOTE_NONE tells the parser to treat double quotes as plain
    text. With the default quoting, a comment containing an unbalanced '"'
    makes the parser read on across line breaks until the next quote, which
    silently merges several records into one row.
    """
    return pd.read_csv(
        path,
        sep='\t',
        names=['comment', 'sentiment'],
        header=None,
        quoting=csv.QUOTE_NONE,
    )


imdb = _read_labelled("data/imdb_labelled.txt")
amazon = _read_labelled("data/amazon_cells_labelled.txt")
yelp = _read_labelled("data/yelp_labelled.txt")

In [3]:
# Gather the three review frames so the following steps can loop over them.
data = [
    imdb,
    amazon,
    yelp,
]

### Preprocessing

In [4]:
# English stop-word list taken from the NLTK stopwords corpus.
stopword = stopwords.words('english')

# Translation table that maps every character in string.punctuation to a
# space; built once so each comment is cleaned in a single pass.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def preprocessing(comment, min_length=4):
    """Reduce a raw comment string to a set of lower-cased content words.

    Steps:
      1. replace every punctuation character with a space,
      2. lower-case and tokenize with ``nltk.word_tokenize``,
      3. drop tokens shorter than ``min_length`` characters,
      4. drop tokens found in the module-level ``stopword`` list.

    Parameters
    ----------
    comment : str
        The raw comment text.
    min_length : int, optional
        Minimum number of characters a token needs to be kept. The default
        of 4 keeps exactly the tokens with ``len(w) > 3``.

    Returns
    -------
    set of str
        The distinct surviving tokens (order and duplicates are discarded).
    """
    # punctuation becomes whitespace so neighbouring words stay separated
    cleaned = comment.translate(_PUNCT_TO_SPACE)
    tokens = nltk.word_tokenize(cleaned.lower())
    # built from the list at call time so later edits to `stopword` are honoured
    stop_set = set(stopword)
    return {w for w in tokens if len(w) >= min_length and w not in stop_set}

# Store each comment's cleaned word set in a new 'word' column of every dataset.
for x in data:
    comments = x['comment']
    x['word'] = comments.apply(preprocessing)

### Feature Extraction

In [5]:
# feature extraction
def extract_feature(word):
    """Build a presence-feature dict from an iterable of words.

    Every distinct item of ``word`` becomes a key whose value is ``True``.
    An empty iterable yields an empty dict.
    """
    return {token: True for token in word}

# Turn each row's word set into a presence-feature dict in a new 'feature' column.
for x in data:
    word_sets = x['word']
    x['feature'] = word_sets.apply(extract_feature)

#### Visualization Functions

In [6]:
# Display names, listed in the same order as the frames in `data`.
name = ['IMDb', 'Amazon', 'Yelp']

# Tag every dataset with its display name in a constant 'name' column.
for i, frame in enumerate(data):
    frame['name'] = name[i]

In [7]:
# bar chart function

def create_bar_chart(performance, materials, yLabel, chartTitle):
    """Draw a bar chart with one value-labelled bar per entry, y-axis fixed to 0-1.

    Parameters
    ----------
    performance : sequence of numbers
        Bar heights; each value is also written as text on its bar.
    materials : sequence of str
        X-axis tick labels, one per bar.
    yLabel : str
        Label for the y-axis.
    chartTitle : str
        Title of the chart.

    A new figure is created with ``plt.subplots()``; nothing is returned.
    """
    x_pos = np.arange(len(materials))
    fig, ax = plt.subplots()
    # Colors are passed as an explicit list: current Matplotlib no longer
    # accepts a string such as 'mcygb' as a sequence of single-letter colors.
    bars = ax.bar(x_pos, performance, align='center', alpha=0.5, color=list('mcygb'))
    # Pair bars with their values positionally, so the labels stay correct
    # even if `performance` is not indexed 0..n-1.
    for rect, value in zip(bars, performance):
        height = rect.get_height()
        # label is centred horizontally and placed just below the bar top
        ax.text(rect.get_x() + rect.get_width() / 2., 0.93 * height, value,
                ha='center', va='bottom', rotation=0)
    ax.set_ylabel(yLabel)
    ax.set_ylim(0, 1)
    ax.set_xticks(x_pos)
    ax.set_xticklabels(materials)
    ax.set_title(chartTitle)
    ax.yaxis.grid(True)

In [8]:
# bar chart with error bar function

def create_bar_with_error_chart(CTEs, error, materials, yLabel, chartTitle):
    """Draw a value-labelled bar chart with error bars, y-axis fixed to 0-1.

    Parameters
    ----------
    CTEs : sequence of numbers
        Bar heights; each value is also written as text next to its bar.
    error : sequence of numbers
        Passed to ``ax.bar`` as ``yerr`` to draw the error bars.
    materials : sequence of str
        X-axis tick labels, one per bar.
    yLabel : str
        Label for the y-axis.
    chartTitle : str
        Title of the chart.

    A new figure is created with ``plt.subplots()``; nothing is returned.
    """
    x_pos = np.arange(len(CTEs))
    fig, ax = plt.subplots()
    # Colors are passed as an explicit list: current Matplotlib no longer
    # accepts a string such as 'mcygb' as a sequence of single-letter colors.
    bars = ax.bar(x_pos, CTEs, yerr=error, align='center', alpha=0.5,
                  ecolor='black', capsize=10, color=list('mcygb'))
    # Pair bars with their values positionally, so the labels stay correct
    # even if `CTEs` is not indexed 0..n-1.
    for rect, value in zip(bars, CTEs):
        height = rect.get_height()
        # label is shifted right of the bar centre so it clears the error bar
        ax.text(rect.get_x() + rect.get_width() / 1.2, height, value,
                ha='center', va='bottom', rotation=0)
    ax.set_ylabel(yLabel)
    ax.set_ylim(0, 1)
    ax.set_xticks(x_pos)
    ax.set_xticklabels(materials)
    ax.set_title(chartTitle)
    ax.yaxis.grid(True)