In [9]:

import pandas as pd
import re
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score




In [11]:
# read training file
def load_data(filepath):
    is_train = False
    id_label = []
    comment = []
    parent_comment = []
    with open(filepath) as f:
        header = f.readline()
        if header.strip().split()[0] == 'label':
            is_train = True
        for line in f:
            row = line.strip().split("\t")
            id_label.append(int(row[0]))
            comment.append(row[1])
            parent_comment.append(row[2])
    if is_train:
        return pd.DataFrame(data={'label':id_label, 'comment': comment, 'parent_comment': parent_comment})
    else:
        return pd.DataFrame(data={'id':id_label, 'comment': comment, 'parent_comment': parent_comment})

dataframe = load_data('../data/train.tsv')
dataframe = dataframe[["label", "comment"]]
dataframe.dropna(inplace=True)

train_ratio = 0.8 # 80% for training, 20% for validation
random_seed = 100

train_dataframe = dataframe.sample(frac=train_ratio, random_state=random_seed)
valid_dataframe = dataframe.drop(train_dataframe.index)

# read test file
test_dataframe = load_data('../data/test.tsv')
test_dataframe = test_dataframe[["id", "comment"]]
test_dataframe.fillna("", inplace=True)

print('training set size:', len(train_dataframe))
print('validation set size:', len(valid_dataframe))
print('test set size:',len(test_dataframe))


# construct corpus
list_all_words = []
for i in dataframe.comment:
    words = word_tokenize(i)
    for word in words:
        list_all_words.append(word)

# construct TF-IDF matrix for training data
tfidf_vectorizer = TfidfVectorizer(input=list_all_words, lowercase=True, min_df=2, ngram_range=(1, 3))
tfidf_matrix_valid = tfidf_vectorizer.fit_transform(valid_dataframe.comment)
tfidf_matrix_train = tfidf_vectorizer.transform(train_dataframe.comment)
tfidf_matrix_test = tfidf_vectorizer.transform(test_dataframe.comment)

feature_names = tfidf_vectorizer.get_feature_names()

print(tfidf_matrix_train.shape)
print(tfidf_matrix_valid.shape)
print(tfidf_matrix_test.shape)

training set size: 42426
validation set size: 10606
test set size: 17719
(42426, 17746)
(10606, 17746)
(17719, 17746)


In [12]:
# construct labels
train_list = []
for i in train_dataframe.label:
    train_list.append(i)
    
valid_list = []
for i in valid_dataframe.label:
    valid_list.append(i)
y_train = pd.Series(train_list)
y_valid=pd.Series(valid_list)
# train Logistic Regression model
logmodel = LogisticRegression()
logmodel.fit(tfidf_matrix_train, y_train)

logit = LogisticRegression(C=1, n_jobs=4, solver='lbfgs', 
                           random_state=17, verbose=1)
# sklearn's pipeline
tfidf_logit_pipeline = Pipeline([('tf_idf',tfidf_vectorizer), 
                                 ('logit', logit)])
train_pred = logmodel.predict(tfidf_matrix_train)
accuracy_score(y_train, train_pred)

0.7645076132560222

In [13]:
valid_pred = logmodel.predict(tfidf_matrix_valid)
accuracy_score(y_valid, valid_pred)

0.6814067508957194

In [10]:
y_pred = logmodel.predict(tfidf_matrix_test)
pd.Series(y_pred).to_csv("LogisticRegression.csv", header=["label"], index_label="id")