In [11]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
import gc
import numpy as np
import scipy
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import pickle
# Make sure the NLTK "stopwords" corpus is available locally; stopwords.words("english")
# is read from it further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xiaopanzhang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# Column names are supplied explicitly via `names=`, so pandas treats every
# row of the file as data rather than looking for a header row.
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
dataset_path = "dataset/training.1600000.processed.noemoticon.csv"

print("Open file:", dataset_path)
df = pd.read_csv(
    dataset_path,
    names=DATASET_COLUMNS,
    encoding="ISO-8859-1",
)
print("Dataset size:", df.shape[0])

# Alternation of the fragments that preprocess() replaces with a single space:
# "@..." mentions, "http:"/"https:" links, and any run of characters outside
# A-Z, a-z, 0-9.
# Written as a raw string: "\S" inside a normal string literal is an invalid
# escape sequence, which newer Python versions report as a warning. The
# pattern text itself is unchanged.
# NOTE(review): the third alternative `http?:\S` only matches "htt:"/"http:"
# plus ONE character and looks like a typo; kept as-is so cleaning results do
# not change.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
# English stop words from the NLTK stopwords corpus; preprocess() drops every token found here.
stop_words = stopwords.words("english")
# English Snowball stemmer; preprocess() only applies it when called with stem=True.
stemmer = SnowballStemmer("english")

# Frozen snapshot of the stop-word list. Set membership is O(1), whereas the
# list had to be scanned once per token, which dominates the cost when this
# function is applied to every tweet. Rebuild it if stop_words is ever changed.
_STOP_WORD_SET = frozenset(stop_words)


def preprocess(text, stem=False):
    """Lower-case and clean one piece of text, returning the kept tokens joined by spaces.

    Args:
        text: The raw text. It is passed through ``str()`` first, so
            non-string values are accepted as well.
        stem: When true, every kept token is replaced by
            ``stemmer.stem(token)``; otherwise tokens are kept verbatim.

    Returns:
        The surviving tokens separated by single spaces; an empty string when
        no token survives.
    """
    # Everything matched by TEXT_CLEANING_RE (mentions, links, non-alphanumerics) becomes a space.
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept = (token for token in cleaned.split() if token not in _STOP_WORD_SET)
    if stem:
        kept = (stemmer.stem(token) for token in kept)
    return " ".join(kept)


Open file: dataset/training.1600000.processed.noemoticon.csv
Dataset size: 1600000


In [13]:
from collections import Counter

from sklearn.naive_bayes import GaussianNB

# Rows taken from EACH half of the data per iteration (the training loop
# stacks two such slices), and how many such slices fit into half of the rows.
train_size = 100
num_iter = df.shape[0] // train_size // 2

gnb = GaussianNB()

# Clean every tweet once, up front; preprocess is passed directly instead of
# being wrapped in a lambda.
df["text"] = df["text"].apply(preprocess)
X = df["text"]
y = df["target"]

# Fixed seed: without it every kernel restart produces a different row order
# and therefore a different accuracy curve.
SHUFFLE_SEED = 42
X, y = shuffle(X, y, random_state=SHUFFLE_SEED)

X = X.tolist()
y = y.tolist()

# Only the two lists are used from here on; drop the DataFrame to free memory.
del df
gc.collect()

70

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Token counts over the whole corpus. X is rebound from the list of cleaned
# texts to the sparse matrix returned by fit_transform.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)
full_vocab = vectorizer.vocabulary_
vectorizer_test = CountVectorizer(vocabulary=full_vocab)

# Every batch stacks a slice from the first half of the rows on top of the
# slice at the same position in the second half. The offset is derived from
# the data instead of the literal 800000, so the cell keeps working when the
# dataset is subsampled.
half_offset = X.shape[0] // 2

# All labels that can occur in y, as required by GaussianNB.partial_fit.
class_labels = [0, 4]


def _paired_batch(start, stop):
    """Return (dense feature array, label list) for rows [start, stop) of both halves."""
    features = scipy.sparse.vstack(
        (X[start:stop], X[half_offset + start:half_offset + stop])
    )
    labels = y[start:stop] + y[half_offset + start:half_offset + stop]
    # GaussianNB needs a dense array; only one small batch is densified at a time.
    return features.toarray(), labels


acc_list = []

for i in range(num_iter - 1):

    train_start = i * train_size
    train_stop = train_start + train_size
    test_stop = train_stop + train_size

    X_train, y_train = _paired_batch(train_start, train_stop)
    # Evaluate on the slice that directly follows the training slice; it is
    # the training slice of the next iteration.
    X_test, y_test = _paired_batch(train_stop, test_stop)

    # Passing the same classes on every call is accepted, so no special case
    # is needed for the first iteration.
    gnb.partial_fit(X_train, y_train, classes=class_labels)
    y_pred = gnb.predict(X_test)

    # y_test is a plain list; convert explicitly so the comparison is element-wise.
    correct_num = (np.asarray(y_test) == y_pred).sum()
    acc = correct_num / len(y_test)
    print(len(y_test), correct_num, acc)

    acc_list.append(acc)

    # Checkpoint the curve after every iteration so progress survives an interrupted run.
    with open('acc_list.pkl', 'wb') as handle:
        pickle.dump(acc_list, handle, protocol=pickle.HIGHEST_PROTOCOL)

    plt.plot(acc_list)
    plt.xlabel('iteration')
    # Values are fractions in [0, 1], not percentages, so the label no longer says "(%)".
    plt.ylabel('accuracy (fraction correct)')
    plt.savefig('accuracy.png')
    plt.clf()


200 113 0.565
200 117 0.585
200 125 0.625
200 121 0.605
200 112 0.56
200 107 0.535
200 105 0.525
200 130 0.65
200 94 0.47
200 115 0.575
200 127 0.635
200 115 0.575
200 109 0.545
200 101 0.505
200 115 0.575
200 113 0.565
200 112 0.56
200 113 0.565
200 117 0.585
200 116 0.58
200 112 0.56
200 122 0.61
200 97 0.485
200 113 0.565
200 120 0.6
200 98 0.49
200 102 0.51
200 115 0.575
200 110 0.55
200 121 0.605
200 102 0.51
200 113 0.565
200 100 0.5
200 128 0.64
200 97 0.485
200 110 0.55
200 112 0.56
200 120 0.6
200 103 0.515
200 114 0.57
200 111 0.555
200 109 0.545
200 123 0.615
200 108 0.54
200 96 0.48
200 110 0.55
200 117 0.585
200 115 0.575
200 109 0.545
200 113 0.565
200 121 0.605
200 120 0.6
200 104 0.52
200 112 0.56
200 117 0.585
200 105 0.525
200 111 0.555
200 113 0.565
200 106 0.53
200 114 0.57
200 111 0.555
200 116 0.58
200 100 0.5
200 108 0.54
200 99 0.495
200 98 0.49
200 117 0.585
200 105 0.525
200 100 0.5
200 100 0.5
200 102 0.51
200 101 0.505
200 113 0.565
200 101 0.505
200 109 0.545

200 108 0.54
200 100 0.5
200 106 0.53
200 113 0.565
200 119 0.595
200 93 0.465
200 105 0.525
200 105 0.525
200 106 0.53
200 115 0.575
200 110 0.55
200 106 0.53
200 101 0.505
200 97 0.485
200 103 0.515
200 108 0.54
200 111 0.555
200 110 0.55
200 99 0.495
200 100 0.5
200 99 0.495
200 100 0.5
200 115 0.575
200 108 0.54
200 109 0.545
200 117 0.585
200 113 0.565
200 105 0.525
200 112 0.56
200 111 0.555
200 97 0.485
200 92 0.46
200 111 0.555
200 101 0.505
200 111 0.555
200 102 0.51
200 119 0.595
200 111 0.555
200 123 0.615
200 103 0.515
200 107 0.535
200 110 0.55
200 99 0.495
200 113 0.565
200 106 0.53
200 107 0.535
200 104 0.52
200 103 0.515
200 104 0.52
200 102 0.51
200 107 0.535
200 115 0.575
200 94 0.47
200 99 0.495
200 104 0.52
200 117 0.585
200 98 0.49
200 109 0.545
200 95 0.475
200 112 0.56
200 101 0.505
200 118 0.59
200 105 0.525
200 100 0.5
200 107 0.535
200 102 0.51
200 97 0.485
200 116 0.58
200 93 0.465
200 108 0.54
200 106 0.53
200 111 0.555
200 100 0.5
200 101 0.505
200 122 0.61

200 106 0.53
200 109 0.545
200 107 0.535
200 105 0.525
200 102 0.51
200 104 0.52
200 100 0.5
200 98 0.49
200 103 0.515
200 96 0.48
200 97 0.485
200 103 0.515
200 100 0.5
200 111 0.555
200 106 0.53
200 121 0.605
200 113 0.565
200 106 0.53
200 98 0.49
200 104 0.52
200 107 0.535
200 106 0.53
200 110 0.55
200 90 0.45
200 98 0.49
200 94 0.47
200 98 0.49
200 116 0.58
200 98 0.49
200 100 0.5
200 111 0.555
200 105 0.525
200 110 0.55
200 107 0.535
200 108 0.54
200 110 0.55
200 104 0.52
200 99 0.495
200 100 0.5
200 111 0.555
200 95 0.475
200 109 0.545
200 117 0.585
200 112 0.56
200 104 0.52
200 111 0.555
200 110 0.55
200 101 0.505
200 98 0.49
200 102 0.51
200 109 0.545
200 95 0.475
200 99 0.495
200 119 0.595
200 106 0.53
200 110 0.55
200 100 0.5
200 122 0.61
200 119 0.595
200 109 0.545
200 96 0.48
200 113 0.565
200 99 0.495
200 110 0.55
200 107 0.535
200 112 0.56
200 105 0.525
200 95 0.475
200 108 0.54
200 116 0.58
200 109 0.545
200 105 0.525
200 104 0.52
200 111 0.555
200 109 0.545
200 99 0.495