In [1]:
import pandas as pd
train = pd.read_csv('data/labeledTrainData.tsv', 
                    header=0, delimiter='\t', quoting=3)
test = pd.read_csv('data/testData.tsv', 
                   header=0, delimiter='\t', quoting=3)

<h1>전처리는 HTML만 제거</h1>

In [3]:
# from preprocessing import preprocessing
from bs4 import BeautifulSoup

def review_to_words(raw_review):
    review_text = BeautifulSoup(raw_review, 'html.parser').get_text()
    return review_text

In [5]:
train['review_clean'] = [review_to_words(s) for s in train['review']]

In [7]:
test['review_clean'] = [review_to_words(s) for s in test['review']]

In [8]:
train['review_clean'][:10]

0    "With all this stuff going down at the moment ...
1    "\"The Classic War of the Worlds\" by Timothy ...
2    "The film starts with a manager (Nicholas Bell...
3    "It must be assumed that those who praised thi...
4    "Superbly trashy and wondrously unpretentious ...
5    "I dont know why people think this is such a b...
6    "This movie could have been very good, but com...
7    "I watched this video at a friend's house. I'm...
8    "A friend of mine bought this film for £1, and...
9    "This movie is full of references. Like \"Mad ...
Name: review_clean, dtype: object

In [12]:
x_train = train['review_clean']
x_test = test['review_clean']

<h1>TF-IDF</h1>

In [10]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from nltk.corpus import words
import nltk
nltk.download('words')

vectorizer = CountVectorizer(analyzer = 'word',
                            lowercase = True,
                            tokenizer = None,
                            preprocessor = None,
                            stop_words = 'english',
                            min_df = 2, #토큰이 나타날 최소 문서 개수.
                            ngram_range=(1,3),
                            vocabulary = set(words.words()),
                            max_features = 90000)
vectorizer

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\trevor\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=90000, min_df=2,
        ngram_range=(1, 3), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None,
        vocabulary={'anthropotomical', 'diabolicalness', 'evilproof', 'parthenogeny', 'inadvertence', 'decahedral', 'irreverend', 'interopercle', 'guildhall', 'bellyflaught', 'Cayubaban', 'untrickable', 'hydranth', 'unchasteness', 'running', 'unfanatical', 'vagabondismus', 'aswoon', 'Horatio', 'countertail'...cosovereign', 'pantler', 'forecited', 'enanthema', 'coercionary', 'kerflap', 'porrect', 'dyingness'})

In [11]:
# TfidfTransformer() -> norm = ('L1', 'L2')가 있고. L2가 디폴트. L2 : 벡터의 각 원소의 제곱의 합이 1이 되도록 만드는것. L1은 벡터의 각 원소의 절댓값의 합이 1
# smooth_idf = false : 피처를 만들 때 0으로 나오는 항목에 대해 작은 값을 더해서(스무딩을 해서) 피처를 만들지 아니면 그냥 생성할지.
# use_idf = True : TF를 사용할지 아니면 TFIDF를 사용할지.

pipeline = Pipeline([
    ('vect', vectorizer),
    ('tfidf', TfidfTransformer(smooth_idf = False)),
])
pipeline

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=90000, min_df=2,
        ngram_range=(1, 3), preprocessor=None, stop_words='english',
       ...('tfidf', TfidfTransformer(norm='l2', smooth_idf=False, sublinear_tf=False,
         use_idf=True))])

In [13]:
x_train_tfidf_vector = pipeline.fit_transform(x_train)

  idf = np.log(float(n_samples) / df) + 1.0


In [14]:
vocab = vectorizer.get_feature_names()
print(len(vocab))
vocab[:10]

235892


['A',
 'Aani',
 'Aaron',
 'Aaronic',
 'Aaronical',
 'Aaronite',
 'Aaronitic',
 'Aaru',
 'Ab',
 'Ababdeh']

In [15]:
x_test_tfidf_vector = pipeline.fit_transform(x_test)

  idf = np.log(float(n_samples) / df) + 1.0


In [16]:
import numpy as np
dist = np.sum(x_train_tfidf_vector, axis=0)

for tag, count in zip(vocab,dist):
    print(tag, count)

pd.DataFrame(dist, columns=vocab)

A [[ 0.  0.  0. ...,  0.  0.  0.]]


Unnamed: 0,A,Aani,Aaron,Aaronic,Aaronical,Aaronite,Aaronitic,Aaru,Ab,Ababdeh,...,zymotechnical,zymotechnics,zymotechny,zymotic,zymotically,zymotize,zymotoxic,zymurgy,zythem,zythum
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators = 100, n_jobs=-1, random_state=2018)
forest

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=2018, verbose=0,
            warm_start=False)

In [18]:
forest = forest.fit(x_train_tfidf_vector, train['sentiment'])

<h1>Cross Validation 교차 검증</h1>

In [19]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

k_fold = KFold(n_splits=5, shuffle=True, random_state=2018)

score = np.mean(cross_val_score(forest, x_train_tfidf_vector, train['sentiment'], cv = k_fold, scoring = 'roc_auc', n_jobs=-1))

In [20]:
'{:,.5f}'.format(score)

'0.92068'

In [22]:
rst = forest.predict(x_test_tfidf_vector)

In [23]:
rst[:10]

array([1, 0, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

In [24]:
output = pd.DataFrame(data = {'id':test['id'], 'sentiment':rst})
output.head()

Unnamed: 0,id,sentiment
0,"""12311_10""",1
1,"""8348_2""",0
2,"""5828_4""",0
3,"""7186_2""",0
4,"""12128_7""",1


In [25]:
output_sentiment = output['sentiment'].value_counts()
print(output_sentiment[0] - output_sentiment[1])
output_sentiment

-376


1    12688
0    12312
Name: sentiment, dtype: int64

In [26]:
output.to_csv('data/tutorial_4_tfidf_{0:.5f}.csv'.format(score), index=False, quoting=3)