In [1]:
import numpy as np
from scipy import sparse
from collections import Counter
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import nltk
from nltk.corpus import stopwords
import string
import pandas as pd
import sklearn.naive_bayes as sn

# Tokens that tokenize() discards: NLTK's English stop-word list plus every
# single character in string.punctuation, stored as a set for O(1) lookups.
stop_words = set(stopwords.words('english') + list(string.punctuation))

def tokenize(text):
    '''
    Split a document into lower-cased word tokens, dropping noise tokens.

    :param text: a doc with multiple sentences, type: str
    return the surviving tokens in document order, type: list
    https://textminingonline.com/dive-into-nltk-part-ii-sentence-tokenize-and-word-tokenize

    Each token produced by nltk.word_tokenize is lower-cased and then dropped
    when it is in the module-level ``stop_words`` set (English stop words and
    single punctuation characters) or when ``str.isnumeric()`` is true for it.
    e.g.
    Input: 'It is a nice day. I am happy.'
    Output: ['nice', 'day', 'happy']
    '''
    lowered = (raw.lower() for raw in nltk.word_tokenize(text))
    # keep a token only if it is neither a stop word / punctuation mark nor a number
    return [tok for tok in lowered if not (tok in stop_words or tok.isnumeric())]

def get_bagofwords(data, vocab_dict):
    '''
    Build a document-by-vocabulary matrix of raw term counts.

    :param data: one token list per document (any sized iterable of word
        lists, e.g. a list or a pandas Series), type: list
    :param vocab_dict: a dict from words to column indices, type: dict
    return a (len(data), len(vocab_dict)) float64 count matrix,
        type: scipy.sparse.csr_matrix
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html

    Words missing from vocab_dict are ignored. A word mapped to -1 is also
    ignored, because -1 is the "not found" sentinel.
    '''
    rows, cols, counts = [], [], []
    for row, doc in enumerate(data):
        # Count each distinct word once per document instead of incrementing a
        # sparse cell per occurrence; single-element sparse writes are slow.
        for word, freq in Counter(doc).items():
            col = vocab_dict.get(word, -1)
            if col != -1:
                rows.append(row)
                cols.append(col)
                counts.append(freq)

    # Build the CSR matrix in one shot from (value, (row, col)) triplets.
    # csr: row based format, better for matrix multiplication.
    return sparse.csr_matrix(
        (
            np.asarray(counts, dtype=np.float64),
            (np.asarray(rows, dtype=np.intp), np.asarray(cols, dtype=np.intp)),
        ),
        shape=(len(data), len(vocab_dict)),
        dtype=np.float64,
    )

def read_data(file_name, vocab=None):
    """
    Load a CSV, tokenize its ``text`` column and turn it into bag-of-words counts.

    :param file_name: path handed to pandas.read_csv; the file must provide
        ``id``, ``label`` and ``text`` columns
    :param vocab: collection of known words; when None it is gathered from
        every token of this file
    return (id column, label column, count matrix, vocab)
    https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
    """
    frame = pd.read_csv(file_name)
    frame['words'] = frame['text'].apply(tokenize)

    if vocab is None:
        vocab = set()
        for doc_tokens in frame['words']:
            vocab.update(doc_tokens)

    # word -> column index, following the iteration order of `vocab`
    word_to_col = {word: col for col, word in enumerate(vocab)}

    counts = get_bagofwords(frame['words'], word_to_col)

    return frame['id'], frame['label'], counts, vocab


In [2]:
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV

In [3]:
def read_word(file_name):
    """
    Load a CSV and compute TF-IDF features from its cleaned, stemmed text.

    :param file_name: path handed to pandas.read_csv; the file must provide a
        ``text`` column
    return (df, counts, transformer): the frame with an added ``words`` column
        (space-joined stemmed tokens), the TF-IDF matrix of that column, and
        the fitted TfidfTransformer

    Note: the fitted CountVectorizer is not returned, so another file cannot
    be projected onto the same vocabulary through this function alone.
    """
    # Imported here because the notebook only imports PorterStemmer in a
    # later cell; this keeps the function usable right after it is defined.
    from nltk.stem import PorterStemmer

    df = pd.read_csv(file_name)
    df['words'] = df['text'].map(lambda x: x.lower())
    # Strip punctuation. regex=True is required: newer pandas treats the
    # pattern as a literal string by default, which would remove nothing.
    df['words'] = df['words'].str.replace(r'[^\w\s]', '', regex=True)
    df['words'] = df['words'].apply(nltk.word_tokenize)

    stemmer = PorterStemmer()

    # Stem every token and join back into one space-separated string per
    # document, which is the input format CountVectorizer expects.
    df['words'] = df['words'].apply(
        lambda x: ' '.join(stemmer.stem(y) for y in x))

    count_vect = CountVectorizer()
    counts = count_vect.fit_transform(df['words'])
    transformer = TfidfTransformer().fit(counts)

    counts = transformer.transform(counts)
    return df, counts, transformer

In [5]:
from nltk.stem import PorterStemmer

# Load the training reviews and build a cleaned `words` column:
# lower-case -> strip punctuation -> tokenize() -> space-joined string.
df = pd.read_csv('Project 1 Sentiment Classification/data/train.csv')
df['words'] = df['text'].map(lambda x: x.lower())
# regex=True is required: newer pandas treats the pattern as a literal string
# by default, in which case no punctuation would be removed at all.
df['words'] = df['words'].str.replace(r'[^\w\s]', '', regex=True)
df['words'] = df['words'].apply(tokenize)

stemmer = PorterStemmer()

# Stemming is currently disabled; `stemmer` is only used by the line below.
# df['words'] = df['words'].apply(lambda x: [stemmer.stem(y) for y in x])
# This converts the list of words into space-separated strings
df['words'] = df['words'].apply(lambda x: ' '.join(x))

print(df.shape)


(100000, 10)


In [23]:
# Spot-check one sample: print the raw text and star rating stored under index label 0.
i = 0  # NOTE(review): `i` is not used by the print below
print(df['text'][0], " : ", df['stars'][0])

As a student, by back and neck are under constant strain and Dr Serrick manages to set them straight every time. I highly recommend the thumper treatment for really great muscle pain relief. Overall I would say that Dr Serrick is very knowledgeable, empathetic, and thorough. Highly recommended.  :  5.0


In [130]:
def modify_csv(fp, np):
    """
    Write a two-column (``stars``, ``text``) copy of a review CSV.

    :param fp: path of the source CSV; it must contain ``stars`` and ``text``
        columns
    :param np: path of the CSV to write. The name shadows the usual ``numpy``
        alias inside this function; it is kept so keyword callers still work.
    return None; the result is written to ``np`` without the index column

    ``stars`` is converted to integers (e.g. 5.0 -> 5) before writing.
    """
    d_f = pd.read_csv(fp)
    stars = d_f['stars'].astype(int)
    result = pd.concat([stars, d_f['text']], axis=1)

    # Hand the path to pandas rather than a handle from open(np, 'w'): pandas
    # then writes UTF-8 and controls line endings itself. A locale-encoded
    # text handle can fail on non-ASCII reviews and doubles line endings on
    # Windows.
    result.to_csv(np, index=False)


In [131]:
# Write the reduced (stars, text) copy of the training split to mod_train.csv.
modify_csv('Project 1 Sentiment Classification/data/train.csv', 'mod_train.csv')

In [132]:
# Write the reduced (stars, text) copy of the validation split to mod_valid.csv.
modify_csv('Project 1 Sentiment Classification/data/valid.csv', 'mod_valid.csv')

In [127]:
# NOTE(review): no top-level `result` is assigned anywhere in the visible
# notebook (the only `result` is a local inside modify_csv), so this cell
# would raise NameError on a fresh kernel. Its execution count is lower than
# that of the modify_csv cells, so it looks like a leftover from before the
# helper existed. It also targets the same mod_valid.csv file. Confirm and
# consider deleting this cell.
with open('mod_valid.csv', 'w') as f: 
    result.to_csv(f, index=False)


In [133]:
# Load the reduced validation file.
# NOTE(review): this rebinds `df`, which other cells index with 'words' and
# 'label'; those cells only work if `df` holds a different frame when they
# run. A distinct name would avoid the hidden-state dependency.
df = pd.read_csv('mod_valid.csv')

In [136]:
# Distinct values present in the `stars` column.
set(df.stars)

{1, 2, 3, 4, 5}

In [45]:
from tqdm import tqdm 

In [62]:
import re

In [59]:
import spacy

# Load the spaCy 'en_core_web_md' pipeline once; tokenizer() below only uses
# its `.tokenizer` component.
NLP = spacy.load('en_core_web_md')

In [76]:
def tokenizer(comment):
    """
    Normalise a review with regular expressions and split it with spaCy.

    :param comment: the review text; it is coerced with ``str()`` first, so
        non-string values are tokenized from their string form instead of
        failing on ``.lower()``
    return the token texts produced by ``NLP.tokenizer``, excluding tokens
        that are exactly one space, type: list

    Steps: lower-case; replace the listed punctuation characters (including
    ``!``, ``,``, ``.`` and ``;``) and newlines with a space; collapse runs
    of spaces; collapse runs of ``?`` into a single ``?``.
    """
    # Coerce before lower-casing; the original coerced only afterwards, so the
    # conversion could never help a non-string input.
    comment = str(comment).lower()
    comment = re.sub(
        r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;.,]", " ",
        comment)
    comment = re.sub(r"[ ]+", " ", comment)
    # No separate passes for "!" and ",": the substitution above already
    # replaced every one of them with a space, so only "?" can still repeat.
    comment = re.sub(r"\?+", "?", comment)
    return [
        x.text for x in NLP.tokenizer(comment) if x.text != " "]

In [77]:
# Corpus-wide frequency of every token produced by tokenizer().
v_count = {}

for review in tqdm(df['text']):
    for tok in tokenizer(review):
        # start unseen tokens at 0, then add this occurrence
        v_count[tok] = v_count.get(tok, 0) + 1

100%|██████████| 100000/100000 [04:03<00:00, 410.84it/s]


In [78]:
# Materialise the (token, count) pairs so they can be sorted in place.
sorted_v = list(v_count.items())

In [84]:
# Sort in place by the second element of each pair (the count), ascending,
# so the rarest tokens come first.
sorted_v.sort(key=lambda r: r[1])

In [94]:
# Show the first 100 pairs, i.e. the lowest counts after the ascending sort.
sorted_v[:100]

[('thumper', 1),
 ('50gal', 1),
 ('5gpm', 1),
 ('tommaso', 1),
 ('saucing', 1),
 ('integrative', 1),
 ('personalised', 1),
 ('3514', 1),
 ('hygeine', 1),
 ('evading', 1),
 ('modes', 1),
 ('expaned', 1),
 ('thouroughly', 1),
 ('nitrite', 1),
 ('infuriatingly', 1),
 ('johnstown', 1),
 ('jalap', 1),
 ('demerara', 1),
 ("everyone'd", 1),
 ('hums', 1),
 ('funnily', 1),
 ("d'backs", 1),
 ('nack', 1),
 ('bullcrap', 1),
 ('gastroparesis', 1),
 ('titilate', 1),
 ('heloc', 1),
 ('150k', 1),
 ('okkkkkkkk', 1),
 ('hospitaliser', 1),
 ('commodités', 1),
 ('banque', 1),
 ('nationale', 1),
 ('bucanero', 1),
 ('nista', 1),
 ('ingreds', 1),
 ('flavorable', 1),
 ('aaactually', 1),
 ('stretchinnng', 1),
 ('hrmmmmm', 1),
 ('muuuuuuuuha', 1),
 ('haaah', 1),
 ('spookily', 1),
 ('narrate', 1),
 ('larious', 1),
 ('bordello', 1),
 ('opp', 1),
 ('pshaw', 1),
 ('yip', 1),
 ('laxatives', 1),
 ('マッカラン空港に近い、ラスベガスのストリップ大通り南側にあります', 1),
 ('エクスカリバー、ピラミッド形のラクソーとグループのホテルが３つ並んでいて、３つのホテルの間は無料のモノレールが利用できます。エクスカリバーの交差点を北側へ行

In [86]:
from nltk.corpus import stopwords
# NOTE(review): this rebinds the name `stopwords` from the NLTK corpus reader
# to a plain set of English stop words. Any later code that calls
# `stopwords.words(...)` without re-importing will fail; a separate name for
# the set would be safer.
stopwords = set(stopwords.words('english'))

In [87]:
# Keep only the (token, count) pairs whose token is not in `stopwords`.
# The list is rebound to the same name, so re-running the cell is harmless.
sorted_v = [t for t in sorted_v if t[0] not in stopwords]

In [90]:
# Number of (token, count) pairs currently held in sorted_v.
len(sorted_v)

81961

In [91]:
# How many entries of sorted_v occur exactly once in the corpus.
count = sum(1 for _, freq in sorted_v if freq == 1)

print(count)

36133


In [6]:
# Load the test reviews and build a cleaned `words` column:
# lower-case -> strip punctuation -> tokenize() -> space-joined string.
test = pd.read_csv('Project 1 Sentiment Classification/data/test.csv')
test['words'] = test['text'].map(lambda x: x.lower())
# regex=True is required: newer pandas treats the pattern as a literal string
# by default, in which case no punctuation would be removed at all.
test['words'] = test['words'].str.replace(r'[^\w\s]', '', regex=True)
test['words'] = test['words'].apply(tokenize)

# Stemming is currently disabled.
# test['words'] = test['words'].apply(lambda x: [stemmer.stem(y) for y in x])
# This converts the list of words into space-separated strings
test['words'] = test['words'].apply(lambda x: ' '.join(x))
print(test.shape)

(10000, 10)


In [31]:
from sklearn.model_selection import GridSearchCV
# Search grid for a pipeline whose steps are named 'vect', 'tfidf' and 'clf':
# n-gram range of the vectorizer, whether to apply IDF weighting, and the
# `C` parameter of the classifier step.
parameters = {'vect__ngram_range': [ (1, 1), (1, 2), (2, 2)],
              'tfidf__use_idf': (True, False),
              'clf__C': [1, 10, 0.1],
}

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
# Text pipeline: token counts -> TF-IDF weighting -> multinomial logistic
# regression. The step names ('vect', 'tfidf', 'clf') are what the grid-search
# parameter keys refer to.
# NOTE(review): MLPClassifier is imported but not used in this cell.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LogisticRegression(random_state=0, solver='lbfgs',
                         multi_class='multinomial')),
])
# text_clf.fit(df['words'], df['label'])  

In [34]:
# 5-fold grid search over `parameters`, using all CPU cores (n_jobs=-1).
# NOTE(review): the `iid` argument was deprecated and later removed from
# scikit-learn's GridSearchCV; on current releases this call raises TypeError
# and the argument has to be dropped. Confirm the targeted scikit-learn version.
gs_clf = GridSearchCV(text_clf, parameters, cv=5, iid=False, n_jobs=-1)

In [35]:
# Run the search on the first 400 rows only, which keeps it fast.
# NOTE(review): this needs `df` to carry 'words' and 'label' columns. Which
# frame `df` refers to depends on the order the cells were executed in, so
# verify that on a fresh top-to-bottom run.
gs_clf = gs_clf.fit(df['words'][:400], df['label'][:400])

In [36]:
# Best mean cross-validated score found by the search.
gs_clf.best_score_                                  

0.4425170685088869

In [37]:
# Print the winning value of every searched parameter, in alphabetical order.
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))


clf__C: 10
tfidf__use_idf: False
vect__ngram_range: (1, 1)


In [None]:
# SGDClassifier is not imported anywhere else in the visible notebook, so it
# is imported here to keep the cell from raising NameError.
from sklearn.linear_model import SGDClassifier

# Text pipeline: token counts -> term-frequency normalisation (IDF disabled)
# -> linear classifier trained with SGD (hinge loss, L2 penalty).
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-2, random_state=42,
                                           max_iter=5, tol=None)),
])
text_clf.fit(df['words'], df['label'])

In [None]:
# Predict labels for the preprocessed test texts with the fitted pipeline.
predicted = text_clf.predict(test['words'])

In [None]:
# Submission file: one row per test id with its predicted label.
sub_df = pd.DataFrame({"id": test['id'], "pred": predicted})
sub_df.to_csv("sgd.csv", index=False)

In [33]:
# Inspect the preprocessed text stored under index label 9.
df['words'][9]

'excel stay hotel monaco past delight recept staff friendli profession room smart comfort bed particularli like recept small dog receiv staff guest spoke love mild neg distanc uphil ppmarket restaur 1st overal great experi'

In [84]:
# Build the training count matrix and its vocabulary, then vectorise the test
# file with that same vocabulary so both matrices share their columns.
# NOTE(review): these paths ("all/...") differ from the
# "Project 1 Sentiment Classification/data/..." files used elsewhere in the
# notebook — confirm which dataset is intended.
train_id_list, train_data_label, train_data_matrix, vocab = read_data("all/train.csv")
print("Vocabulary Size:", len(vocab))
print("Training Set Size:", len(train_id_list))
# The test labels and the returned vocab are discarded.
test_id_list, _, test_data_matrix, _ = read_data("all/test.csv", vocab)
print("Test Set Size:", len(test_id_list))

Vocabulary Size: 70839
Training Set Size: 16000
Test Set Size: 4491


In [113]:
# Search grid for the 'clf' step: regularisation strength `alpha` and the
# loss function.
# NOTE(review): `alpha` and `loss` are SGDClassifier-style parameters; make
# sure the pipeline this grid is used with has such an estimator as 'clf'.
parameters = {'clf__alpha': [1e-2, 1e-1, 1],
              'clf__loss' : ['hinge', 'perceptron']
}

In [114]:
# Single-step pipeline: class-balanced multinomial logistic regression that
# takes an already vectorised matrix as input.
# NOTE(review): LogisticRegression has no `alpha` or `loss` parameter, so a
# grid keyed on 'clf__alpha' / 'clf__loss' cannot be searched with this
# pipeline. The recorded best parameters (alpha, loss) suggest a different
# classifier was in place when the search actually ran — confirm.
clf = Pipeline([
        ('clf', LogisticRegression(solver='lbfgs', multi_class='multinomial', class_weight='balanced')),
              ])

In [115]:
# 5-fold grid search over `parameters`, using all CPU cores (n_jobs=-1).
# NOTE(review): the `iid` argument was deprecated and later removed from
# scikit-learn's GridSearchCV; on current releases this call raises TypeError
# and the argument has to be dropped. Confirm the targeted scikit-learn version.
gs_clf = GridSearchCV(clf, parameters, cv=5, iid=False, n_jobs=-1)

In [116]:
# Run the search on the first 400 rows of the training matrix and labels only.
gs_clf = gs_clf.fit(train_data_matrix[:400], train_data_label[:400])

In [117]:
# Best mean cross-validated score found by the search.
gs_clf.best_score_                                  

0.4593276337150476

In [118]:
# Print the winning value of every searched parameter, in alphabetical order.
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.1
clf__loss: 'hinge'


In [119]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with library defaults, fitted on the full training matrix.
# NOTE(review): no random_state is passed, so the fitted forest (and its
# predictions) can differ from run to run; set one for reproducibility.
rf = RandomForestClassifier()
rf.fit(train_data_matrix, train_data_label)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [120]:
# Random-forest predictions for the test matrix.
# NOTE(review): `rf_result` is not referenced again in the visible cells.
rf_result = rf.predict(test_data_matrix)

In [41]:
# Load two previously written prediction files so they can be compared.
stem = pd.read_csv('log_stem.csv')
cnn = pd.read_csv('cnn.csv')

In [42]:
# Fraction of rows on which the two files have the same 'pred' value.
# NOTE(review): the recorded agreement is about 3.5%, which looks too low for
# two models solving the same task — check that both files share the same row
# order and the same label encoding before drawing conclusions.
np.mean(stem['pred'] == cnn['pred'])


0.034958806501892675

In [128]:
# Fraction of predictions on which `eclf_result` matches log['pred'].
# NOTE(review): neither `eclf_result` nor `log` is defined in the visible
# cells; this only works with leftover kernel state.
np.mean(eclf_result == log['pred'])

0.7742150968603875

In [129]:
# Submission file for the ensemble: one row per test id with its prediction.
# NOTE(review): `eclf_result` is not defined in the visible cells, so this
# cell depends on leftover kernel state.
sub_df = pd.DataFrame()
sub_df["id"] = test_id_list
sub_df["pred"] = eclf_result
sub_df.to_csv("ensemble.csv", index=False)