In [1]:
import os
import numpy as np
import pandas as pd

import torch
import re
import random

from scipy.sparse import csr_matrix, vstack
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter

In [2]:
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)
random.seed(SEED)

In [3]:
# CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
print(device)

cuda:0


In [4]:
dev_text_file = 'dev_text.txt'
dev_label_file = 'dev_label.txt'
heldout_text_file = 'heldout_text.txt'
heldout_pred_file = 'heldout_pred_nb.txt'

In [5]:
with open(dev_label_file, 'rt', encoding='UTF8') as f:
    dev_label = np.asarray(f.readlines())
    f.close()
with open(dev_text_file, 'rt', encoding='UTF8') as f:
    dev_text = np.asarray(f.readlines())
    f.close()
with open(heldout_text_file, 'rt', encoding='UTF8') as f:
    heldout_text = np.asarray(f.readlines())
    f.close()
    
dev_label = [label.replace('\n', '') for label in dev_label]
dev_text = [text.replace('\n', '') for text in dev_text]
heldout_text = [text.replace('\n', '') for text in heldout_text]

In [6]:
def preprocess(data):
    word = [] #list of cleaned token 
    text = [] #list of cleaned text
    
    for line in data:
        content_text = re.sub(r'\([^)]*\)', '', line) 
        sent_text = sent_tokenize(content_text)

        normalized_text = []
        for string in sent_text:
            tokens = re.sub(r'[^A-Za-z0-9\s]+', '', string.lower())
            tokens = re.sub(r'\d+', '', tokens)
            normalized_text.append(tokens)

        result_content = ' '.join(normalized_text)
        result_sentence = [word_tokenize(sentence) for sentence in normalized_text]
        result = [word for sentence in result_sentence for word in sentence]

        word.append(result)
        text.append(result_content)
    
    return word, text

In [7]:
dev_word_list, dev_text_list = preprocess(dev_text)
heldout_word_list, heldout_text_list = preprocess(heldout_text)

In [8]:
#Data into pd.DataFrame format
Train_word = pd.DataFrame(columns=['label', 'word'])
Train_text = pd.DataFrame(columns=['label', 'text'])
for label, text, word in zip(dev_label, dev_text_list, dev_word_list):
    Train_word = Train_word.append([{'label': label, 'word': word}], ignore_index=True, sort=False)
    Train_text = Train_text.append([{'label': label, 'text': text}], ignore_index=True, sort=False)

Test_word = pd.DataFrame(columns=['label', 'word'])
Test_text = pd.DataFrame(columns=['label', 'text'])
for text, word in zip(heldout_text_list, heldout_word_list):
    Test_word = Test_word.append([{'label': None, 'word': word}], ignore_index=True, sort=False)
    Test_text = Test_text.append([{'label': None, 'text': text}], ignore_index=True, sort=False)

In [9]:
train_all_token_list = [token for text in Train_word['word'] for token in text]

In [10]:
#Build vocab from pos and neg
#5000 each, unique size is 6692
pos_token_list = [token for words in Train_word[Train_word['label']=='pos']['word'] for token in words]
pos_vocab = Counter(pos_token_list).most_common(5000)
pos_vocab = [c[0] for c in pos_vocab]

neg_token_list = [token for words in Train_word[Train_word['label']=='neg']['word'] for token in words]
neg_vocab = Counter(neg_token_list).most_common(5000)
neg_vocab = [c[0] for c in neg_vocab]

vocab = np.unique(pos_vocab+neg_vocab)
vocab_dict = dict(zip(vocab, range(len(vocab))))

In [11]:
vectorizer = TfidfVectorizer(vocabulary=vocab_dict, ngram_range=(1,1))
vectorizer.fit(train_all_token_list)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use...
                vocabulary={'a': 0, 'abandoned': 1, 'abc': 2, 'abducted': 3,
                            'ability': 4, 'able': 5, 'abomination': 6,
                            'about': 7, 'aboutbr': 8, 'above': 9, 'abr': 10,
                            'abroad': 11, 'abrupt': 12, 'absence': 13,
                            'absolute': 14, 'absolutely': 15, 'absorbing': 16,
                            'absurd': 17, 'absurdity': 18, 'abuse': 19,
                            'abused': 20, 'abusing': 21, 'abusive': 22,
          

In [12]:
Train_feature = vectorizer.transform(Train_text['text'])
Test_feature = vectorizer.transform(Test_text['text'])

In [13]:
train_data = []
for i in range(Train_feature.shape[0]):
    train_data.append([Train_text['label'].iloc[i], Train_feature.getrow(i)])
train_data = np.asarray(train_data)
np.random.shuffle(train_data)

In [14]:
train_set, validation_set = train_test_split(train_data, test_size=0.2, random_state=SEED)
test_set = Test_feature.copy()

In [15]:
#Random Sampling for train data
train_set_pos = train_set[train_set[:,0]=='pos']
train_set_neg = train_set[train_set[:,0]=='pos']
train_set_sampling = [train_set.copy()]
for i in range(10):
    pos_sample = train_set_pos[np.random.choice(range(len(train_set_pos)), 100),:]
    train_set_sampling.append(pos_sample)

    neg_sample = train_set_pos[np.random.choice(range(len(train_set_neg)), 100),:]
    train_set_sampling.append(neg_sample)
train_set_sampling = np.vstack(train_set_sampling)
np.random.shuffle(train_set_sampling)

In [16]:
y_train = train_set[:,0]
X_train = vstack(train_set[:,1]).toarray()
y_validation = validation_set[:,0]
X_validation = vstack(validation_set[:,1]).toarray()
X_test = test_set.toarray()

In [17]:
from sklearn.naive_bayes import MultinomialNB
skf = StratifiedKFold(n_splits=10)
params = {'alpha':[0.1, 0.5, 1.0, 2.0]}
nb = MultinomialNB()
gs = GridSearchCV(nb, cv=skf, param_grid=params, return_train_score=True)

In [18]:
nb.fit(X_train, y_train)
nb.score(X_validation, y_validation)

0.8275

In [19]:
gs.fit(X_train, y_train)
gs.score(X_validation, y_validation)

0.8525

In [20]:
predict = gs.predict(X_test)

In [21]:
with open(heldout_pred_file, 'w') as f:
    for l in predict:
        f.write(str(l)+'\n')
    f.close()