In [2]:
# DataFrame
import pandas as pd

In [3]:
# Column labels for the CSV, in file order. Because `names=` is supplied
# below, pandas treats the first row of the file as data rather than a header.
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
dataset_path = "dataset/training.1600000.processed.noemoticon.csv"

print("Open file:", dataset_path)
df = pd.read_csv(
    dataset_path,
    names=DATASET_COLUMNS,
    encoding="ISO-8859-1",
)
print("Dataset size:", len(df))

Open file: dataset/training.1600000.processed.noemoticon.csv
Dataset size: 1600000


In [37]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
# Pattern of everything that preprocess() replaces with a space:
#   @\S+            "@" followed by non-space characters (user mentions)
#   https?:\S+      "http:" / "https:" followed by non-space characters (links)
#   http?:\S        "htt:" / "http:" followed by ONE non-space character
#   [^A-Za-z0-9]+   any run of characters that are not ASCII letters/digits
# Written as a raw string so that "\S" reaches the regex engine untouched; in
# a normal string literal "\S" is an invalid escape sequence, which Python
# reports as a DeprecationWarning (3.6+) / SyntaxWarning (3.12+).
# NOTE(review): the third alternative looks like a typo of the second one;
# it is kept unchanged so cleaning results stay identical — confirm intent.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# English stop-word list and stemmer shared by preprocess().
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

def preprocess(text, stem=False):
    """Normalise one piece of text into a space-separated token string.

    The input is converted with ``str()``, lower-cased, and every match of
    ``TEXT_CLEANING_RE`` is replaced by a single space. The result is split
    on whitespace, tokens found in ``stop_words`` are dropped, and the
    remaining tokens are joined back together with single spaces.

    Args:
        text: Value to clean; any object accepted by ``str()``.
        stem: When true, each kept token is passed through ``stemmer.stem``.

    Returns:
        The cleaned text as a single string (empty if no token survives).
    """
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept = (word for word in cleaned.split() if word not in stop_words)
    if stem:
        kept = (stemmer.stem(word) for word in kept)
    return " ".join(kept)

In [38]:
from collections import Counter
# Number of rows taken from EACH of the two source ranges for train / test.
train_size, test_size = 40000, 10000

# Row offset at which the second source range starts.
# NOTE(review): presumably the file is ordered by label and the second class
# begins at row 800000 — the class counts printed below are consistent with
# that, but confirm against the dataset.
SECOND_RANGE_START = 800000

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat produces the same stacked frame (original row index preserved).
train = pd.concat([
    df[:train_size],
    df[SECOND_RANGE_START : SECOND_RANGE_START + train_size],
])
# Test rows are the `test_size` rows immediately following each training range,
# so train and test never overlap.
test = pd.concat([
    df[train_size : train_size + test_size],
    df[SECOND_RANGE_START + train_size : SECOND_RANGE_START + train_size + test_size],
])

# Sanity check: label distribution of each subset.
print(Counter(train.target))
print(Counter(test.target))

Counter({0: 40000, 4: 40000})
Counter({0: 10000, 4: 10000})


In [39]:
# Clean the tweet text of both subsets in place (stemming left at its default, off).
for subset in (train, test):
    subset["text"] = subset["text"].apply(preprocess)

In [40]:
# Pull the text column (features) and target column (labels) out of each
# subset as plain Python lists.
X_train, y_train = train["text"].tolist(), train["target"].tolist()
X_test, y_test = test["text"].tolist(), test["target"].tolist()

In [41]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: the vocabulary is learned from the training texts only.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)

# Token -> column-index mapping learned above.
full_vocab = vectorizer.vocabulary_

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2.
# Ordering the vocabulary by column index yields the same list of names on
# every scikit-learn version. Only the size and a short preview are printed,
# since dumping the whole vocabulary floods the cell output.
feature_names = sorted(full_vocab, key=full_vocab.get)
print(len(feature_names), "features; first 20:", feature_names[:20])

# NOTE: the GaussianNB cell that consumes these matrices is fed dense arrays,
# so the sparse result is densified here. This is memory-hungry:
# rows x vocabulary x 8 bytes with CountVectorizer's default int64 dtype
# (roughly 30 GB for the 80000 x 47548 training matrix recorded below).
X_train = X_train.toarray()
print(X_train.shape)

# Encode the test texts with the already-fitted vectorizer so the columns
# line up with the training matrix; tokens unseen in training are ignored.
# `vectorizer_test` is kept as a name for any later cell that refers to it.
vectorizer_test = vectorizer
X_test = vectorizer_test.transform(X_test)
X_test = X_test.toarray()
print(X_test.shape)




(80000, 47548)
(20000, 47548)


In [42]:
from sklearn.naive_bayes import GaussianNB
# Fit a Gaussian naive Bayes classifier on the training matrix, then label
# the test matrix with it.
# NOTE(review): GaussianNB models features as continuous values; for word
# counts a multinomial model may be a better fit — worth evaluating.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Report "<number of test rows> <number of predictions equal to the label>".
n_correct = (y_test == y_pred).sum()
print(X_test.shape[0], n_correct)

20000 10415


In [32]:
# Re-report the score using the `y_pred` already held in the kernel from the
# GaussianNB cell; the prediction call itself is left disabled:
# y_pred = gnb.predict(X_test)
n_test_rows = X_test.shape[0]
print(n_test_rows, (y_test == y_pred).sum())

20000 10415
