In [1]:
import os
import re
import pickle

import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from nltk.stem import SnowballStemmer
from nltk import word_tokenize as nltk_wtknz

In [2]:
# Disabled alternative input (pipe-separated CSV), kept for reference only:
# df = pd.read_csv("../data/mtv_test_2.csv", sep="|")

In [3]:
# Where the evaluation corpus lives (relative paths).
BASE_DIR = '../'
TEXT_DATA_DIR = os.path.join(BASE_DIR, 'data/test2_for_mtv')

# One text file per class; load_data() reads them in this order.
TEXT_DATA_FILE_1, TEXT_DATA_FILE_2 = "rt-polarity_neg.txt", "rt-polarity_pos.txt"

# When True, load_data() discards the first line of every file.
HEADER = True

In [4]:
def load_data(data_dir=None, files=None, header=None):
    """Read labelled text files into parallel lists of texts and labels.

    Every line of every file becomes one sample. A file whose name (without
    its extension) ends in ``"pos"`` is labelled 1; any other file is
    labelled 0.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing the files. Defaults to ``TEXT_DATA_DIR``.
    files : iterable of str, optional
        File names to read, in order. Defaults to
        ``(TEXT_DATA_FILE_1, TEXT_DATA_FILE_2)``.
    header : bool, optional
        Whether to skip the first line of each file. Defaults to ``HEADER``.

    Returns
    -------
    (list of str, list of int)
        The lines (trailing newline removed) and their 0/1 labels, aligned
        by position.

    Raises
    ------
    OSError
        If a file cannot be opened.
    """
    # Resolve the defaults at call time so that editing the config cell takes
    # effect without having to re-run this definition.
    if data_dir is None:
        data_dir = TEXT_DATA_DIR
    if files is None:
        files = (TEXT_DATA_FILE_1, TEXT_DATA_FILE_2)
    if header is None:
        header = HEADER

    texts = []
    targets = []
    for file_name in files:
        # Derive the label from the stem rather than from fixed character
        # positions, so the check does not depend on the extension's length.
        stem, _ext = os.path.splitext(file_name)
        label = 1 if stem.endswith("pos") else 0

        # NOTE: errors='ignore' silently drops bytes that are not valid UTF-8.
        with open(os.path.join(data_dir, file_name), "r",
                  encoding='utf-8', errors='ignore') as f:
            if header:
                # The default keeps an empty file from raising StopIteration.
                next(f, None)
            for line in f:
                texts.append(line.rstrip("\n"))
                targets.append(label)
    return texts, targets

In [5]:
# Read both configured files: `data` holds the raw lines, `labels` the matching 0/1 targets.
data, labels = load_data()

In [6]:
# Evaluation frame: one row per line returned by load_data().
df = pd.DataFrame(
    {'label': labels, 'text': data},
    columns=['label', 'text'],
)

In [7]:
# Sanity check of the loaded frame: row count, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10660 entries, 0 to 10659
Data columns (total 2 columns):
label    10660 non-null int64
text     10660 non-null object
dtypes: int64(1), object(1)
memory usage: 166.6+ KB


In [8]:
# Summary statistics of the numeric column (`label`). With 0/1 labels the mean
# is the share of rows labelled 1.
df.describe()

Unnamed: 0,label
count,10660.0
mean,0.5
std,0.500023
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [9]:
# Everything that is not an ASCII letter is replaced by a space before tokenising.
_NON_LETTERS = re.compile("[^a-zA-Z]")

# Shared stemmer, created on the first tokenize() call instead of once per
# document: it does not depend on the input text.
_stemmer = None


def tokenize(text):
    """Turn ``text`` into a list of stemmed word tokens.

    Non-letter characters are replaced by spaces, the result is split with
    NLTK's ``word_tokenize`` and every token is stemmed with the English
    Snowball stemmer.

    NOTE(review): none of the visible cells calls this function directly.
    Presumably the pickled model loaded later refers to it by name (e.g. as
    a vectoriser's tokenizer), in which case it must stay defined under this
    name before the model is unpickled — confirm.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    list of str
        The stemmed tokens, in order of appearance.
    """
    global _stemmer
    if _stemmer is None:
        _stemmer = SnowballStemmer("english")

    letters_only = _NON_LETTERS.sub(" ", text)
    return [_stemmer.stem(token) for token in nltk_wtknz(letters_only)]

In [10]:
# (rows, columns) of the evaluation frame.
print(df.shape)

(10660, 2)


In [11]:
# Repeats the earlier df.info() summary; df is not modified between the two cells.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10660 entries, 0 to 10659
Data columns (total 2 columns):
label    10660 non-null int64
text     10660 non-null object
dtypes: int64(1), object(1)
memory usage: 166.6+ KB


In [12]:
# Class balance: normalize=True reports fractions of rows instead of raw counts.
df['label'].value_counts(normalize=True)

1    0.5
0    0.5
Name: label, dtype: float64

In [13]:
# Restore the previously trained classifier from disk.
# SECURITY: unpickling executes arbitrary code embedded in the file, so only
# load dumps that come from a trusted source.
# NOTE(review): presumably the dump references the notebook-level `tokenize`
# function, which would then have to be defined before this cell runs — confirm.
with open('../dumps/m_lin_svc_mtv_100_out.pkl', mode='rb') as f:
    model = pickle.Unpickler(f).load()

In [14]:
# Predict a label for every text. The raw strings are passed in unchanged, so the
# loaded model presumably does its own feature extraction — TODO confirm.
y_predicted = model.predict(df['text'])

In [15]:
# Overall accuracy followed by the per-class precision / recall / F1 table.
# On the recorded run accuracy is about 0.61, with recall 0.24 for class 0 and
# 0.98 for class 1, i.e. the model labels most texts as class 1.
print(
    accuracy_score(df['label'], y_predicted),
    classification_report(df['label'], y_predicted),
    sep="\n",
)

0.610131332083
             precision    recall  f1-score   support

          0       0.93      0.24      0.38      5330
          1       0.56      0.98      0.72      5330

avg / total       0.75      0.61      0.55     10660

