In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
import gc
import numpy as np
import scipy
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import pickle
from sklearn.naive_bayes import MultinomialNB
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xiaopanzhang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
dataset_path = "dataset/training.1600000.processed.noemoticon.csv"
print("Open file:", dataset_path)
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
df = pd.read_csv(dataset_path, encoding="ISO-8859-1" , names=DATASET_COLUMNS)
print("Dataset size:", len(df))

TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

def preprocess(text, stem=False):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    #Convert www.* or https?://* to URL
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',text)
    
    #Convert @username to __USERHANDLE
    text = re.sub('@[^\s]+','__USERHANDLE',text)  
    
    #Replace #word with word
    text = re.sub(r'#([^\s]+)', r'\1', text)
    
    #trim
    text = text.strip('\'"')
    
    # Repeating words like hellloooo
    repeat_char = re.compile(r"(.)\1{1,}", re.IGNORECASE)
    text = repeat_char.sub(r"\1\1", text)
    
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

Open file: dataset/training.1600000.processed.noemoticon.csv
Dataset size: 1600000


In [3]:
train_size = 2000
num_iter = df.shape[0] // train_size // 2

gnb = MultinomialNB()

df.text = df.text.apply(lambda x: preprocess(x))
X = df["text"]
y = df["target"]

X, y = shuffle(X, y)

X = X.tolist() 
y = y.tolist() 

del df
gc.collect()

10

In [None]:
vectorizer = TfidfVectorizer(min_df=5, max_df=0.95, sublinear_tf = True,use_idf = True,ngram_range=(1, 2))
X = vectorizer.fit_transform(X)
full_vocab = vectorizer.vocabulary_
# vectorizer_test = TfidfVectorizer(vocabulary=full_vocab)

acc_list = []

for i in range(num_iter - 1):

    train_starting_index = i * train_size
    train_end_index = (i + 1) * train_size

    X_train = scipy.sparse.vstack((X[train_starting_index : train_end_index], X[800000 + train_starting_index : 800000 + train_end_index]))
    
    X_train = X_train.toarray()
    y_train = y[train_starting_index : train_end_index] + y[800000 + train_starting_index : 800000 + train_end_index]

    X_test = scipy.sparse.vstack((X[train_starting_index + train_size : train_end_index + train_size], X[800000 + train_starting_index + train_size : 800000 + train_end_index + train_size]))
    X_test = X_test.toarray()
    y_test = y[train_starting_index + train_size : train_end_index + train_size] + y[800000 + train_starting_index + train_size : 800000 + train_end_index + train_size]

    if i == 0:
        y_pred = gnb.partial_fit(X_train, y_train, [0, 4]).predict(X_test)
    else:
        y_pred = gnb.partial_fit(X_train, y_train).predict(X_test)
    
    y_pred = gnb.fit(X_train, y_train).predict(X_test)
    correct_num = (y_test == y_pred).sum()
    acc = correct_num / (train_size * 2) 
    print((train_size * 2), correct_num, acc)

    acc_list.append(acc)

    with open('acc_list.pkl', 'wb') as handle:
        pickle.dump(acc_list, handle, protocol=pickle.HIGHEST_PROTOCOL)

    plt.plot(acc_list)
    plt.xlabel('iteration')
    plt.ylabel('accuracy (%)')
    plt.savefig('accuracy.png')
    plt.clf()

4000 2799 0.69975
4000 2880 0.72
4000 2820 0.705
4000 2787 0.69675
4000 2781 0.69525
4000 2813 0.70325
4000 2846 0.7115
4000 2842 0.7105
4000 2806 0.7015
4000 2794 0.6985
4000 2789 0.69725
4000 2798 0.6995
4000 2811 0.70275
4000 2838 0.7095
4000 2830 0.7075
4000 2801 0.70025
4000 2771 0.69275
4000 2834 0.7085
4000 2795 0.69875
4000 2801 0.70025
