In [103]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import glob
import os
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re

In [104]:
train_df = pd.read_csv("train_master.tsv", delimiter='\t')

In [105]:
train_df.head()

Unnamed: 0,file_name,label
0,train_0000.txt,0
1,train_0001.txt,0
2,train_0002.txt,1
3,train_0003.txt,1
4,train_0004.txt,0


In [106]:
train_df["txt"] = 0
i = 0
train_files = glob.glob("./train/*.txt")
for file in train_files:
    bfilename = os.path.basename(file)
    with open(file) as f:
        data = f.read()
        train_df.loc[i, "txt"] = data
    i += 1
print(train_df.head())


        file_name  label                                                txt
0  train_0000.txt      0  Subject: re : buyback / deficiency deals works...
1  train_0001.txt      0  Subject: fw : stress relief\n- - - - - origina...
2  train_0002.txt      1  Subject: from mrs . juliana\ndear friend ,\npl...
3  train_0003.txt      1  Subject: [ wrenches ] 68 % off dreamweaver mx ...
4  train_0004.txt      0  Subject: y 2 k - texas log\nname home pager\ng...


In [107]:
# test
test_files = glob.glob("./test/*.txt")
test_df = pd.read_csv("./sample_submit.csv", names=('file_name', 'label'))
test_df["txt"] = 0
i = 0
for file in test_files:
    bfilename = os.path.basename(file)
    with open(file) as f:
        data = f.read()
        test_df.loc[i, "txt"] = data
    i += 1
print(test_df.head())

       file_name  label                                                txt
0  test_0000.txt      1  Subject: join the thousands who are now sp @ m...
1  test_0001.txt      1  Subject: potential list fo 9 / 00\ndaren :\npe...
2  test_0002.txt      0  Subject: bounce skel @ iit . demokritos . gr :...
3  test_0003.txt      1  Subject: hpl meter # 981488 paris tenaska hpl\...
4  test_0004.txt      0  Subject: hpl nom for august 3 , 2000\n( see at...


In [108]:
# preprocess method
def to_lower(word):
    result = word.lower()
    return result

def remove_special_characters(word):
    result = word.translate(str.maketrans(dict.fromkeys(string.punctuation)))
    return result

def remove_stop_words(words):
    result = [i for i in words if i not in ENGLISH_STOP_WORDS]
    return ''.join(result)

def remove_hyperlink(word):
    return re.sub(r"http\S+", "", word)

In [109]:
# 小文字へ置換する
train_df["txt"] = train_df["txt"].map(to_lower)
test_df["txt"] = test_df["txt"].map(to_lower)

In [110]:
# 特殊文字の置換
train_df["txt"] = train_df["txt"].map(remove_special_characters)
test_df["txt"] = test_df["txt"].map(remove_special_characters)

In [111]:
# ストップワード削除
train_df["txt"] = train_df["txt"].map(remove_stop_words)
test_df["txt"] = test_df["txt"].map(remove_stop_words)

In [112]:
# URL削除
train_df["txt"] = train_df["txt"].map(remove_hyperlink)
test_df["txt"] = test_df["txt"].map(remove_hyperlink)

In [113]:
train_df.head()

Unnamed: 0,file_name,label,txt
0,train_0000.txt,0,subject re buybck defcency dels worksheet\nt...
1,train_0001.txt,0,subject fw stress relef\n orgnl messge ...
2,train_0002.txt,1,subject from mrs juln\nder frend \nplese don ...
3,train_0003.txt,1,subject wrenches 68 off dremwever mx 2004 f...
4,train_0004.txt,0,subject y 2 k texs log\nnme home pger\ngeorge...


In [114]:
vectorizer = CountVectorizer(min_df=3, stop_words='english')
vectorizer.fit(train_df['txt'])

print('Vocabulary size: {}'.format(len(vectorizer.vocabulary_)))
print('Vocabulary content: {}'.format(type(vectorizer.vocabulary_)))

Vocabulary size: 8109
Vocabulary content: <class 'dict'>


In [115]:
# 文章を特徴ベクトル化
X_train_bow = vectorizer.transform(train_df['txt'])
X_test_bow = vectorizer.transform(test_df['txt'])

print('X_train_bow:\n{}'.format(repr(X_train_bow)))
print('X_test_bow:\n{}'.format(repr(X_test_bow)))

X_train_bow:
<2586x8109 sparse matrix of type '<class 'numpy.int64'>'
	with 158555 stored elements in Compressed Sparse Row format>
X_test_bow:
<2586x8109 sparse matrix of type '<class 'numpy.int64'>'
	with 150156 stored elements in Compressed Sparse Row format>


In [116]:
print(len(vectorizer.vocabulary_))
#print(dict(list(vectorizer.vocabulary_.items())))
#print(sorted(dict(list(vectorizer.vocabulary_.items()))), key=lambda x:x[0])

8109


In [117]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_bow, train_df.loc[:,"label"])
print('Train accuracy: {:.3f}'.format(model.score(X_train_bow, train_df.loc[:,"label"])))

Train accuracy: 0.972


In [118]:
pred = model.predict(X_test_bow)

In [119]:
test_files = glob.glob("./test/*.txt")
submit_df = pd.DataFrame(columns=["file", "label"])
for i, file in enumerate(test_files):
    tmp_se = pd.Series([os.path.basename(file), pred[i]])
    submit_df.loc[i, "file"] = os.path.basename(file)
    submit_df.loc[i, "label"] = pred[i]
submit_df.head()

Unnamed: 0,file,label
0,test_0000.txt,1
1,test_0001.txt,0
2,test_0002.txt,1
3,test_0003.txt,0
4,test_0004.txt,0


In [120]:
submit_df.to_csv("./submit_bow_MultinomialNB_preproc.csv", header=None, index=0)