In [5]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Load Train Data set

In [6]:
# File name of the training CSV.
TRAIN_CLEAN_DATA = 'train_clean.csv'

# Directory the CSV files are read from. The trailing slash matters because
# file paths are built by plain string concatenation.
AFTER_DATA_IN_PATH = './preprocessed_data/'

In [7]:
# Read the training CSV; the path is the directory prefix followed directly
# by the file name.
train_data = pd.read_csv(
    filepath_or_buffer=AFTER_DATA_IN_PATH + TRAIN_CLEAN_DATA,
)

# Vectorization with CountVectorizer

In [8]:
# Documents fed to the vectorizer. A review that is empty in the CSV is read
# back as NaN by pandas' default NA parsing, and a NaN document makes
# CountVectorizer raise, so missing reviews are replaced with empty strings.
reviews = list(train_data['review'].fillna(''))
# Sentiment labels as a NumPy array, row-aligned with `reviews`.
y = np.array(train_data['sentiment'])

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# Bag-of-words vectorizer: word-level analysis, vocabulary capped at 5000 features.
vectorizer = CountVectorizer(
    max_features=5000,
    analyzer='word',
)

In [11]:
# Learn the vocabulary from the training reviews and encode them as a sparse
# count matrix in one pass.
train_data_features = vectorizer.fit_transform(raw_documents=reviews)

In [12]:
# Display the matrix summary (shape, dtype, stored elements) as the cell output.
train_data_features

<25000x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 1975048 stored elements in Compressed Sparse Row format>

# Data Split

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Seed handed to train_test_split so the split is repeatable.
RANDOM_SEED = 42
# Fraction of the samples held out for evaluation.
TEST_SIZE = 0.2

In [17]:
# Hold out TEST_SIZE of the vectorized reviews (and their labels) for
# evaluation; the fixed random_state keeps the split the same on every run.
(train_input, eval_input,
 train_label, eval_label) = train_test_split(
    train_data_features,
    y,
    random_state=RANDOM_SEED,
    test_size=TEST_SIZE,
)

# Modeling

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
# Random forest with 100 trees. random_state pins the randomness used while
# building the trees, so the reported accuracy and the submission file are
# reproducible across kernel restarts. RANDOM_SEED is the same seed used for
# the train/eval split.
forest = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)

In [21]:
# Train the forest on the training portion of the split.
forest.fit(X=train_input, y=train_label)

In [22]:
# Report the forest's score on the held-out evaluation split.
print('Accuracy: %f' % (forest.score(X=eval_input, y=eval_label),))

Accuracy: 0.851200


# Load Test Data set

In [23]:
# File name of the test CSV; it is appended to AFTER_DATA_IN_PATH when loading.
TEST_CLEAN_DATA = 'test_clean.csv'

In [24]:
# Read the test CSV from the same directory as the training data.
test_data = pd.read_csv(
    filepath_or_buffer=AFTER_DATA_IN_PATH + TEST_CLEAN_DATA,
)

In [25]:
# Test documents for the vectorizer. A review that is empty in the CSV is read
# back as NaN by pandas' default NA parsing; replacing it with an empty string
# keeps vectorizer.transform from raising, so every id still gets a prediction.
test_reviews = list(test_data['review'].fillna(''))
# Row identifiers carried through unchanged into the submission file.
ids = list(test_data['id'])

In [26]:
# Encode the test reviews with the vocabulary already fitted on the training
# reviews (transform only, no re-fitting).
test_data_features = vectorizer.transform(raw_documents=test_reviews)

In [28]:
# Make sure the output directory exists. exist_ok=True keeps the cell safe to
# re-run and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs('./data_out', exist_ok=True)

In [29]:
# Predict a sentiment label for every vectorized test review.
result = forest.predict(X=test_data_features)

In [30]:
# Show the predicted labels; NumPy abbreviates the long array with "...".
print(result)

[1 0 1 ... 0 1 1]


In [31]:
# Submission table: one row per test id with its predicted sentiment.
output = pd.DataFrame({'id': ids, 'sentiment': result})

In [32]:
# Write the submission CSV without the DataFrame index.
# quoting=3 is csv.QUOTE_NONE: fields are written without surrounding quotes.
output.to_csv(
    './data_out/' + 'Bag_of_Words_randomforest_model.csv',
    quoting=3,
    index=False,
)