# AI Cell Me Competition
## I Summary
### Dataset

## II Analysis
### 2.1. Download dataset
If you have joined the Kaggle [CellMe](https://www.kaggle.com/c/cell-me) competition and have installed the Kaggle API, you can use the code below to download the dataset.

In [1]:
import os
import zipfile
# Absolute working directory; the data paths below (and the CSV loader in a
# later cell) are resolved against it.
wdir = os.getcwd()
data_dir = wdir + "/data"

# A dataset's archive is only required while its extracted CSV is missing.
missing_csvs = [
    name for name in ("test", "train")
    if not os.path.isfile(data_dir + "/" + name + ".csv")
]

# Download when ANY required archive is absent. Checking train.csv.zip alone
# lets a missing test.csv.zip crash the extraction step below.
if any(
    not os.path.isfile(data_dir + "/" + name + ".csv.zip")
    for name in missing_csvs
):
    # Needs the kaggle CLI at ~/.local/bin; .read() waits for the command to end.
    os.popen("~/.local/bin/kaggle competitions download -p ./data cell-me").read()

# Unpack every archive whose CSV has not been extracted yet.
for name in missing_csvs:
    with zipfile.ZipFile(data_dir + "/" + name + ".csv.zip", "r") as zip_ref:
        zip_ref.extractall(data_dir)

# Paths relative to wdir.
test_file_path = "data/test.csv"
train_file_path = "data/train.csv"

### 2.2. Loading libraries

In [2]:
import gensim
from gensim import corpora
import logging
import nltk
import re
import pandas as pd
import numpy as np
from collections import defaultdict
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

#visualization
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

  from numpy.core.umath_tests import inner1d


### 2.3. Pieces of code

#### 2.3.0 Set seed

In [3]:
# Single reproducibility seed for the notebook.
seed = 23
random.seed(seed)  # seeds the standard-library `random` module

# Number of LDA topics handed to dealing_with_reviews().
num_of_topics = 5

#### 2.3.0. Load data

In [4]:
def loader(file_path, sample=False, nrows=1000):
    """Read a comma-separated file located under the working directory.

    Args:
        file_path: Path relative to the module-level ``wdir``.
        sample: When true, only the first ``nrows`` rows are read.
        nrows: Row limit used in sample mode; ignored otherwise.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``.
    """
    # nrows=None makes read_csv load the whole file.
    row_limit = nrows if sample else None
    full_path = wdir + "/" + file_path
    return pd.read_csv(full_path, sep=",", header=0, nrows=row_limit)

#### 2.3.1. Tokenizer 

In [None]:
class Tokenizer:
    """Base class of the tokenizers; its ``tokenize`` is only a placeholder."""

    @staticmethod
    def tokenize(text):
        """Do nothing and return ``None``; subclasses supply real tokenization."""
        return None

#### 2.3.2 Useful regexs

In [None]:
# Emoticons recognised inside a token. Raw strings keep the backslashes as
# regex escapes rather than (invalid) Python string escapes. The winks ";)"
# and ";-)" are separate alternatives (a "|" between them was missing).
# NOTE(review): WordTokenizer lowercases text before this pattern is applied,
# so the upper-case alternatives (":-O", ":P", ":D", ":S", "(H)", "(C)") can
# never match there - confirm whether re.IGNORECASE is wanted.
RE_EMOTICON = re.compile(
    r'(:\)|:-\)|:\(|:-\(|;\)|;-\)|:-O|8-|:P|:D|:\||:S|:\$|:@|8o\||\+o\(|\(H\)|\(C\)|\(\?\))'
)
# http/https URLs built from lowercase letters, digits, dots and slashes.
RE_HTTP = re.compile(r"http(s)?://[/\.a-z0-9]+")

#### 2.3.3. Word tokenizer

In [None]:
class WordTokenizer(Tokenizer):
    """Tokenizer that lowercases the text and splits it on whitespace."""

    @staticmethod
    def tokenize(text):
        """Return the lowercase, whitespace-separated words of ``text``.

        ``text`` is passed through ``str()`` first, so non-string values are
        tokenized by their string form.

        Declared ``@staticmethod`` like the base class, so the call works on
        the class as well as on an instance.
        """
        return str(text).lower().split()

#### 2.3.4. Stopwords

In [None]:
# Lowercase words (plus the "," and "." punctuation tokens) to drop from
# tokenized reviews; passed as the `stopwords` argument of
# dealing_with_reviews() / ReviewTokenizer.tokenize().
stopwords = ["a", "about", "after", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been",
            "before", "being", "between", "both", "by", "could", "did", "do", "does", "doing", "during", "each",
            "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here",
            "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've",
            "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "of",
            "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "own", "shan't", "she", "she'd",
            "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs",
            "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're",
            "they've", "this", "those", "through", "to", "until", "up", "very", "was", "wasn't", "we", "we'd",
            "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's",
            "which", "while", "who", "who's", "whom", "with", "would", "you", "you'd", "you'll", "you're", "you've",
            "your", "yours", "yourself", "yourselves", "above", "again", "against", "aren't", "below", "but", "can't",
            "cannot", "couldn't", "didn't", "doesn't", "don't", "down", "few", "hadn't", "hasn't", "haven't", "if",
            "isn't", "mustn't", "no", "nor", "not", "off", "out", "over", "shouldn't", "same", "too", "under", "why",
            "why's", "won't", "wouldn't",",","."]

#### 2.3.5. Review tokenizer

In [None]:
class ReviewTokenizer(WordTokenizer):
    """Tokenizer for review text: keeps emoticons whole and drops stopwords."""

    @staticmethod
    def tokenize(text, stopwords):
        """Tokenize ``text`` and remove every token found in ``stopwords``.

        The text is first split by ``WordTokenizer``. A token containing an
        emoticon (``RE_EMOTICON``) is replaced by that emoticon and the text
        around it is queued at the end of the list for later processing; any
        other token is replaced by its ``nltk.word_tokenize`` pieces.

        Args:
            text: Review text (any value; converted with ``str()``).
            stopwords: Iterable of tokens to discard from the result.

        Returns:
            List of remaining tokens. Text split off around an emoticon comes
            after the other tokens, so the original word order is not kept.
        """
        tokens = WordTokenizer.tokenize(text)
        # Set membership avoids rescanning the stopword list for every token.
        blocked = frozenset(stopwords)
        i = 0
        while i < len(tokens):
            token = tokens[i]
            match = RE_EMOTICON.search(token)
            if match:
                emoticon = match.group(0)
                tokens[i] = emoticon
                # Queue only the non-empty remainders; empty strings used to
                # leak into the result.
                tokens.extend(part for part in token.split(emoticon) if part)
                i += 1
            else:
                pieces = nltk.word_tokenize(token)
                tokens[i:i + 1] = pieces
                # Advance by the number of inserted pieces: a fixed step of
                # one skipped the following token whenever nothing was
                # inserted.
                i += len(pieces)
        return [word for word in tokens if word not in blocked]

#### 2.3.5. Histogram

In [None]:
#from Ph.D. D.B. tutorial
# Global seaborn styling, applied once for the plots drawn after this cell.
sns.set(style="whitegrid")
# Remap matplotlib's one-letter color codes (such as the "b" passed by
# show_histogram) to seaborn's "muted" palette.
sns.set_color_codes("muted")

def show_histogram(word_counts, title=None):
    """Draw a bar chart with counts on the x axis and labels on the y axis.

    Args:
        word_counts: Mapping of label -> count (anything accepted by
            ``pd.DataFrame.from_dict(..., orient="index")``).
        title: Optional chart title.
    """
    # One row per label: column "index" holds the label, "Count" its value.
    counts_frame = pd.DataFrame.from_dict(word_counts, orient="index")
    counts_frame = counts_frame.reset_index()
    counts_frame = counts_frame.rename(columns={0: 'Count'})

    # Open a new tall figure; barplot then draws on its axes.
    plt.subplots(figsize=(12, 15))
    axes = sns.barplot(x="Count", y="index", data=counts_frame, color="b")
    axes.set(xlabel="Count", ylabel="", title=title)

#### 2.3.6 Remove words that occur only once

In [None]:
# Token -> number of occurrences in the corpus most recently passed to
# remove_once_occurence_words(); read by _remove_once_occurence_words().
frequency = defaultdict(int)


def _remove_once_occurence_words(text):
    """Filter one tokenized text against the current ``frequency`` table.

    Args:
        text: Iterable of tokens.

    Returns:
        List of the tokens whose recorded count is greater than one. Tokens
        never counted are dropped (and are not inserted into the table).
    """
    return [token for token in text if frequency.get(token, 0) > 1]


def remove_once_occurence_words(texts):
    """Count tokens over ``texts`` and drop those that occur only once.

    The module-level ``frequency`` table is reset first, so repeated calls
    (for example re-running a notebook cell) do not inflate the counts and
    let single-occurrence words through.

    Args:
        texts: Sequence of token lists; it is iterated twice.

    Returns:
        A new list of token lists containing only tokens whose total count
        across ``texts`` is greater than one.
    """
    # Clear in place so every reference to the table sees the fresh counts.
    frequency.clear()
    for text in texts:
        for token in text:
            frequency[token] += 1
    return [[token for token in text if frequency[token] > 1] for text in texts]

#### 2.3.7 Build dictionary

In [None]:
def build_dictionary(unique_texts):
    """Build a gensim dictionary from tokenized texts and save it to disk.

    Args:
        unique_texts: Iterable of token lists.

    Returns:
        The ``corpora.Dictionary`` built from ``unique_texts``; it is also
        saved to ``tmp/dictionary.dict``.
    """
    dictionary = corpora.Dictionary(unique_texts)
    # Create the target directory first; saving into a missing one fails.
    os.makedirs('tmp', exist_ok=True)
    dictionary.save('tmp/dictionary.dict')
    return dictionary

#### 2.3.8 Create corpus

In [None]:
def build_corpus(unique_texts, dictionary):
    """Convert tokenized texts to bag-of-words vectors and save them to disk.

    Args:
        unique_texts: Iterable of token lists.
        dictionary: Object providing ``doc2bow`` (a gensim dictionary).

    Returns:
        List with one ``dictionary.doc2bow`` result per text; it is also
        serialized to ``tmp/corpus.mm``.
    """
    corpus = [dictionary.doc2bow(text) for text in unique_texts]
    # Create the target directory first; serializing into a missing one fails.
    os.makedirs('tmp', exist_ok=True)
    #it's very important to save
    corpora.MmCorpus.serialize('tmp/corpus.mm', corpus)
    return corpus

#### 2.3.9 Build lda model

In [None]:
def build_topic_model(corpus, id2word, num_topics=10, passes=20, random_state=None):
    """Train a multicore LDA topic model.

    Args:
        corpus: Bag-of-words corpus to train on.
        id2word: Mapping from token ids to tokens (a gensim dictionary).
        num_topics: Number of topics to fit.
        passes: Number of training passes over the corpus.
        random_state: Optional seed forwarded to gensim so training can be
            reproduced; ``None`` keeps gensim's default behaviour.

    Returns:
        The trained ``gensim.models.LdaMulticore`` model.
    """
    # Surface gensim's training progress (INFO-level log records).
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    # Single-process alternative:
    #lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics, update_every=0, passes=passes)
    lda = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        passes=passes,
        random_state=random_state,
    )
    return lda

#### 2.3.10 Replace texts with topic prediction

In [None]:
def dealing_with_reviews(data, stopwords, num_of_topics):
    """Add LDA topic-weight columns derived from ``data['Reviews']``.

    Reviews are tokenized, tokens occurring only once in the corpus are
    removed, an LDA model with ``num_of_topics`` topics is trained on the
    result, and each review's topic weights are stored in the columns
    ``topic_0`` ... ``topic_<num_of_topics - 1>``. Topics the model does not
    report for a review get weight 0.

    Args:
        data: DataFrame with a ``Reviews`` column; modified in place.
        stopwords: Tokens to discard during tokenization.
        num_of_topics: Number of LDA topics, i.e. of columns added.

    Returns:
        The same ``data`` object with the topic columns added.
    """
    # Iterate over the values themselves: this works for any index, whereas
    # feeding index labels to the positional .iat accessor only works for a
    # default 0..n-1 index.
    texts = [ReviewTokenizer.tokenize(review, stopwords) for review in data['Reviews']]

    texts = remove_once_occurence_words(texts)
    dictionary = build_dictionary(texts)
    corpus = build_corpus(texts, dictionary)
    lda = build_topic_model(corpus, dictionary, num_topics=num_of_topics)

    # `corpus` already holds the bag-of-words of every filtered review, so
    # tokenizing and filtering each review a second time is unnecessary.
    topic_weights = np.zeros((len(texts), num_of_topics), dtype=float)
    for row, bow in enumerate(corpus):
        for topic_id, weight in lda[bow]:
            topic_weights[row, topic_id] = weight

    # Assign whole columns at once instead of element-wise chained
    # assignment (data[col].iat[i] = ...), which pandas does not guarantee to
    # write through to `data`.
    for j in range(num_of_topics):
        data['topic_' + str(j)] = topic_weights[:, j]

    return data

### 2.4 Analysis

#### 2.4.0 Word frequency

In [None]:
#data = loader(train_file_path)
#counter = Counter()
#for i in data.index:
#    words = ReviewTokenizer.tokenize(data.Reviews.iat[i])
#    counter.update(words);

# full data is too large
#show_histogram(counter,"Words frequency")
#import csv
#my_dict = dict(counter)

#with open('mycsvfile.csv', 'w') as f:  # Just use 'w' mode in 3.x
#    w = csv.writer(f)
#    w.writerow(my_dict.keys())
#    w.writerow(my_dict.values())

In [None]:
#test
# Load the whole training CSV (loader's sample mode is off by default).
data = loader(train_file_path)

# Train the topic model on the reviews and add the topic_* columns to `data`.
data = dealing_with_reviews(data,stopwords,num_of_topics)


    

In [None]:


# Encode the target. sort=True makes the integer codes follow the order of
# the rating values, so the code differences used by the error metrics below
# are meaningful (unsorted codes follow order of first appearance).
y = pd.factorize(data['Rating'], sort=True)[0]

# Label-encode the categorical columns, each from its OWN values
# ('Brand Name' was previously encoded from 'Product Name').
data['Product Name'] = pd.factorize(data['Product Name'])[0]
data['Brand Name'] = pd.factorize(data['Brand Name'])[0]

# NOTE(review): only the first 11 columns are considered - confirm that this
# range covers every topic_* column that should be used as a feature.
features = data.columns[:11].drop(['Rating','Reviews','Id'])

X_train, X_test, y_train, y_test = train_test_split(data[features], y, test_size = 0.25, random_state = seed)

# NOTE(review): n_estimators=1 trains a single tree - presumably a smoke-test
# setting; confirm before relying on the scores.
clf = RandomForestClassifier(n_estimators=1,  random_state=seed)
clf.fit(X_train, y_train)

predictions = clf.predict(X_test)
errors = abs(predictions - y_test)
# Errors are measured in rating-code steps, not degrees.
print('Mean Absolute Error:', round(np.mean(errors), 2))
# mean_squared_error returns the MSE; take the square root to report RMSE.
print('RMSE:', np.sqrt(mean_squared_error(y_test, predictions)))
