# Sentiment Analysis

## Exploratory Data Analysis and Preprocessing

In [1]:
import pandas as pd
import os
import re

In [2]:
# Root folder of the review dataset; it contains train/ and test/ splits,
# each with a neg/ and a pos/ sub-folder.
path = "/Users/yqz088/Downloads/ml/dataset"

# Resolve the four labelled sub-folders relative to the dataset root.
train_neg, train_pos, test_neg, test_pos = (
    os.path.join(path, sub_folder)
    for sub_folder in ("train/neg", "train/pos", "test/neg", "test/pos")
)

# Folder lists handed to format_data: one list per split, negative first.
directory1 = [train_neg, train_pos]
directory2 = [test_neg, test_pos]

def format_data(directories):
    """Read labelled review files from *directories* into column lists.

    Every directory is scanned for files named ``<index>_<score>...``
    (for example ``12_7.txt``). A directory whose path ends in ``neg`` is
    labelled ``"negative"``; any other directory is labelled ``"positive"``.
    Within each directory the files are processed in ascending numeric
    ``<index>`` order.

    Files whose names do not start with ``<digits>_<digits>`` (e.g. a stray
    ``.DS_Store``) are skipped instead of aborting the whole load.

    Args:
        directories: Iterable of directory paths to read.

    Returns:
        A dict of four equally long lists, suitable for ``pd.DataFrame``:
        ``"id"`` (``"neg_"``/``"pos_"`` + file name), ``"review"`` (file
        contents), ``"sentiment"`` (``"negative"``/``"positive"``) and
        ``"score"`` (the ``<score>`` part of the file name, as a string).
    """
    # One pattern for both classes, so multi-digit scores are never truncated.
    name_pattern = re.compile(r"([0-9]+)_([0-9]+)")

    text_id = []
    text = []
    category = []
    score = []

    for directory in directories:
        # normpath drops a trailing separator so "…/neg/" is still recognised.
        if os.path.normpath(directory).endswith("neg"):
            prefix, cat = "neg_", "negative"
        else:
            prefix, cat = "pos_", "positive"

        # Keep only well-formed names, remembering the numeric index and score.
        entries = []
        for file_name in os.listdir(directory):
            parsed = name_pattern.match(file_name)
            if parsed is None:
                continue
            entries.append((int(parsed.group(1)), file_name, parsed.group(2)))
        entries.sort(key=lambda entry: entry[0])

        for _, file_name, degree in entries:
            file_path = os.path.join(directory, file_name)
            # Context manager closes the handle; explicit encoding keeps the
            # decoded text independent of the platform default.
            with open(file_path, encoding="utf-8") as fhand:
                inp = fhand.read()

            text_id.append(prefix + file_name)
            text.append(inp)
            category.append(cat)
            score.append(degree)

    data = {"id": text_id, "review": text, "sentiment": category, "score": score}

    return data
        
# Parse both splits from disk into column dictionaries.
train_formated, test_formated = format_data(directory1), format_data(directory2)

# Wrap each split in a DataFrame indexed by the review id.
df_train = pd.DataFrame(train_formated)
df_train = df_train.set_index("id")

df_test = pd.DataFrame(test_formated)
df_test = df_test.set_index("id")

# df_train.head()
# df_test.head()

In [3]:
df_train.sentiment.value_counts()

positive    12500
negative    12500
Name: sentiment, dtype: int64

In [4]:
df_train.score.value_counts()

1     5100
10    4732
8     3009
4     2696
7     2496
3     2420
2     2284
9     2263
Name: score, dtype: int64

In [5]:
# Distinct sentiment names in the training split, in order of first appearance.
possible_labels = df_train["sentiment"].unique()

In [6]:
# Map each sentiment name to its position in possible_labels.
label_dict = {name: position for position, name in enumerate(possible_labels)}

In [7]:
label_dict

{'negative': 0, 'positive': 1}

In [8]:
# Encode each sentiment name as its integer class id. `map` is a plain
# dictionary lookup, so the result does not depend on the implicit downcasting
# that `replace` performs (deprecated in recent pandas releases). Every value
# has an entry because label_dict was built from this column's unique values.
df_train['label'] = df_train.sentiment.map(label_dict)

In [9]:
df_train.head()

Unnamed: 0_level_0,review,sentiment,score,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
neg_0_3.txt,Story of a man who has unnatural feelings for ...,negative,3,0
neg_1_1.txt,Robert DeNiro plays the most unbelievably inte...,negative,1,0
neg_2_1.txt,"I saw the capsule comment said ""great acting.""...",negative,1,0
neg_3_4.txt,If I had not read Pat Barker's 'Union Street' ...,negative,4,0
neg_4_4.txt,This fanciful horror flick has Vincent Price p...,negative,4,0


In [10]:
df_test.sentiment.value_counts()

positive    12500
negative    12500
Name: sentiment, dtype: int64

In [11]:
df_test.score.value_counts()

1     5022
10    4999
8     2850
4     2635
3     2541
9     2344
7     2307
2     2302
Name: score, dtype: int64

In [12]:
# Distinct sentiment names in the test split, in order of first appearance.
possible_labels2 = df_test["sentiment"].unique()

In [13]:
# Map each sentiment name to its position in possible_labels2.
label_dict2 = {name: position for position, name in enumerate(possible_labels2)}

In [14]:
label_dict2

{'negative': 0, 'positive': 1}

In [15]:
# Encode each sentiment name as its integer class id. `map` is a plain
# dictionary lookup, so the result does not depend on the implicit downcasting
# that `replace` performs (deprecated in recent pandas releases). Every value
# has an entry because label_dict2 was built from this column's unique values.
df_test['label'] = df_test.sentiment.map(label_dict2)

In [16]:
df_test.head()

Unnamed: 0_level_0,review,sentiment,score,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
neg_0_2.txt,Once again Mr. Costner has dragged out a movie...,negative,2,0
neg_1_3.txt,This is a pale imitation of 'Officer and a Gen...,negative,3,0
neg_2_3.txt,"It seems ever since 1982, about every two or t...",negative,3,0
neg_3_4.txt,"Wow, another Kevin Costner hero movie. Postman...",negative,4,0
neg_4_4.txt,"Alas, another Costner movie that was an hour t...",negative,4,0


In [17]:
# Persist both labelled splits (index column included) to the working directory.
df_train.to_csv(path_or_buf="train.csv")
df_test.to_csv(path_or_buf="test.csv")