In [54]:
import pandas as pd

# CountVectorizer 用來將文件轉換成 feature vector
from sklearn.feature_extraction.text import CountVectorizer

# cross_val_score 用來做 cross validation
from sklearn.model_selection import cross_val_score

# 底下模組用來處理文字資料
from bs4 import BeautifulSoup
import re

## 讀入資料集

In [55]:
train = pd.read_csv('labeledTrainData.tsv', delimiter='\t')
test = pd.read_csv('testData.tsv', delimiter='\t')

In [56]:
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [57]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
id           25000 non-null object
sentiment    25000 non-null int64
review       25000 non-null object
dtypes: int64(1), object(2)
memory usage: 586.0+ KB


In [58]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
id        25000 non-null object
review    25000 non-null object
dtypes: object(2)
memory usage: 390.7+ KB


In [59]:
def get_words(review):
    raw_text = BeautifulSoup(review, 'lxml').get_text()
    text = re.sub('[^a-zA-Z]', ' ', raw_text)
    words = text.lower().split()
    return words

## 將文字轉為特徵向量

In [60]:
vectorizer = CountVectorizer(analyzer='word', stop_words='english', tokenizer=get_words, lowercase=False)

In [61]:
X_train = vectorizer.fit_transform(train['review'])

In [62]:
X_test = vectorizer.transform(test['review'])

In [63]:
y_train = train['sentiment']

## 使用 Naive Bayes Classifier 來做分類

In [64]:
from sklearn.naive_bayes import MultinomialNB

In [65]:
mnb = MultinomialNB()

In [66]:
# 使用 10 fold cross validation 看看 model 的準確度怎樣
cross_val_score(mnb, X_train, y_train, cv=10).mean()

0.8556000000000001

In [67]:
# 訓練模族
mnb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [46]:
# 預測結果
y_predict = mnb.predict(X_test)

In [47]:
submission_count = pd.DataFrame({'id':test['id'], 'sentiment':y_predict})

In [48]:
submission_count.to_csv('submission.csv', index=False)