### CountVectorizer를 통한 벡터화

In [2]:
import csv
import os

import numpy as np
import pandas as pd

# Folder and file name of the preprocessed training set.
DATA_IN_PATH = './data_in/'
TRAIN_CLEAN_DATA = 'train_clean.csv'

train_csv_path = f"{DATA_IN_PATH}{TRAIN_CLEAN_DATA}"
train_data = pd.read_csv(train_csv_path)

# Review texts as a plain Python list (input to the vectorizer).
reviews = train_data['review'].tolist()
# Sentiment column as a NumPy array; used as the labels once preprocessing is done.
y = np.array(train_data['sentiment'])

In [6]:
# Print the (rows, columns) dimensions, then preview the first five rows.
train_shape = train_data.shape
print(train_shape)
train_data.head(n=5)

(25000, 2)


Unnamed: 0,review,sentiment
0,stuff going moment mj started listening music ...,1
1,classic war worlds timothy hines entertaining ...,1
2,film starts manager nicholas bell giving welco...,0
3,must assumed praised film greatest filmed oper...,0
4,superbly trashy wondrously unpretentious explo...,1


In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Tokenize at the word level and cap each review's feature vector at 5000 entries.
vectorizer = CountVectorizer(
    analyzer="word",
    max_features=5000,
)

# Learn the vocabulary from the training reviews and turn them into count vectors.
train_data_features = vectorizer.fit_transform(reviews)

In [9]:
train_data_features
# Each of the 25000 samples is represented as a vector of 5000 feature values.

<25000x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 1975048 stored elements in Compressed Sparse Row format>

### 데이터 분리

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the seed is passed as random_state.
TEST_SIZE = 0.2
RANDOM_SEED = 42

train_input, eval_input, train_label, eval_label = train_test_split(
    train_data_features,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)

### 모델 생성 및 학습

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Random forest built from 100 decision trees. The seed is fixed (the same
# RANDOM_SEED used for the train/eval split) so that re-running the notebook
# trains the same model and reports the same accuracy.
forest = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)

# Fit on the training portion of the vectorized reviews.
forest.fit(train_input, train_label)

RandomForestClassifier()

### 성능 평가

In [12]:
# Score the fitted forest on the held-out evaluation split and report it.
eval_accuracy = forest.score(eval_input, eval_label)
print("Accuracy: %f" % eval_accuracy)

Accuracy: 0.846200


### 데이터 제출

In [18]:
# File name of the preprocessed test set and the folder for generated output.
TEST_CLEAN_DATA = 'test_clean.csv'
DATA_OUT_PATH = './data_out/'

test_csv_path = f"{DATA_IN_PATH}{TEST_CLEAN_DATA}"
test_data = pd.read_csv(test_csv_path)

# Review texts to classify and the ids that accompany them.
test_reviews = test_data['review'].tolist()
ids = test_data['id'].tolist()

In [19]:
# Print the (rows, columns) dimensions, then preview the first five rows.
test_shape = test_data.shape
print(test_shape)
test_data.head(n=5)

(25000, 2)


Unnamed: 0,review,id
0,naturally film main themes mortality nostalgia...,"""12311_10"""
1,movie disaster within disaster film full great...,"""8348_2"""
2,movie kids saw tonight child loved one point k...,"""5828_4"""
3,afraid dark left impression several different ...,"""7186_2"""
4,accurate depiction small time mob life filmed ...,"""12128_7"""


In [20]:
# Vectorize the list of test sentences with the already-fitted vectorizer
# (transform only, so the test features use the vocabulary learned from the
# training reviews).
test_data_features = vectorizer.transform(test_reviews)

In [23]:
# Display the summary of the vectorized test matrix.
test_data_features

<25000x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 1927902 stored elements in Compressed Sparse Row format>

In [24]:
# Predict a sentiment label for every vectorized test review.
result = forest.predict(test_data_features)

# Pair each test id with its predicted label.
output = pd.DataFrame(data={'id': ids, 'sentiment': result})

# to_csv does not create missing folders, so make sure the output folder
# exists before writing.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# csv.QUOTE_NONE is the named constant for the value 3: fields are written
# without any added quoting. NOTE(review): the id values appear to already
# contain literal double quotes (see the head() output) - confirm this is why
# quoting is disabled.
output.to_csv(DATA_OUT_PATH + "CV_RF_model.csv", index=False, quoting=csv.QUOTE_NONE)