## LAB II DATA MINING 
This part is worth 30% of your grade. It is a report of your work developing the model for the competition (you can use code and comment it). This report should include your preprocessing steps, your feature engineering steps, and an explanation of your model. You can also mention different things you tried and insights you gained.

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pandas.io.json import json_normalize

# 1. Data Preparation
## 1.1 Load data

In [4]:
# read files
# Every input file sits in the same folder; keeping that folder in one constant
# means a relocated dataset needs a single edit instead of four.
DATA_DIR = "../input/lab2-data-mining"

data_identification = pd.read_csv(f"{DATA_DIR}/data_identification.csv")
emotion = pd.read_csv(f"{DATA_DIR}/emotion.csv")
sampleSubmission = pd.read_csv(f"{DATA_DIR}/sampleSubmission.csv")
# lines=True: the tweet dump is parsed as one JSON record per line
raw_data = pd.read_json(f"{DATA_DIR}/tweets_DM.json", lines=True, orient='columns')

In [5]:
# preview the first ten rows of the identification table
data_identification.head(10)

In [6]:
# preview the first ten rows of the emotion labels
emotion.head(10)

In [7]:
# preview the first ten raw tweet records
raw_data.head(10)

In [8]:
# flatten the nested '_source' records into one column per field
df = pd.json_normalize(raw_data['_source'])

# give the flattened 'tweet.*' columns short names
short_names = {
    "tweet.text": "text",
    "tweet.tweet_id": "tweet_id",
    "tweet.hashtags": "hashtags",
}
# index=str also converts the row labels to strings, as in the original call
df = df.rename(index=str, columns=short_names)

In [9]:
# preview the flattened and renamed tweet table
df.head(10)

In [10]:
# attach the train/test flag to each tweet by joining on tweet_id
df = df.merge(data_identification, on="tweet_id")

## 1.2 Clean text column

In [11]:
# clean the text
import re
from string import punctuation

# Compiled once at definition time: matches any single ASCII punctuation character.
_PUNCTUATION_RE = re.compile(f"[{re.escape(punctuation)}]")


def preprocess_text(text):
    """Normalise one tweet for vectorisation.

    The text is lowercased, every character in ``string.punctuation`` is
    removed, and any run of whitespace (spaces, tabs, newlines) is collapsed
    to a single space with leading/trailing whitespace stripped.

    Parameters
    ----------
    text : str
        Raw tweet text. A non-string value (e.g. ``None`` or ``NaN`` for a
        missing tweet) is treated as empty instead of raising.

    Returns
    -------
    str
        The cleaned text; ``""`` for empty or non-string input.
    """
    # A missing tweet reaches us as None/NaN, which has no .lower()
    if not isinstance(text, str):
        return ""
    text = text.lower()  # lowercase text
    text = _PUNCTUATION_RE.sub("", text)  # remove punctuation
    return " ".join(text.split())  # remove extra spaces, tabs, and new lines

# run the cleaner over every tweet and store the result back in 'text'
cleaned_text = df['text'].map(preprocess_text)
df['text'] = cleaned_text

# preview the cleaned text
df.head(10)

## 1.3 Train and test data splitting

In [12]:
# split into train and test dataset using the 'identification' flag
train_df = df[df['identification']=='train']
# attach the emotion label to each training tweet (join on tweet_id)
train_df = pd.merge(train_df, emotion, on='tweet_id')

# .copy() gives test_df its own data, so adding the placeholder column below is
# a plain assignment rather than a write to a slice of df
# (which raises SettingWithCopyWarning).
test_df = df[df['identification']=='test'].copy()
# empty placeholder; it is overwritten with predictions in the output section
test_df["emotion"]=""

In [13]:
# drop the identification and hashtags columns from both frames
unused_columns = ['identification', 'hashtags']
train_df = train_df.drop(columns=unused_columns)
test_df = test_df.drop(columns=unused_columns)

# preview the training set
train_df.head(10)

In [14]:
# preview the test set
test_df.head(10)

In [15]:
# number of training tweets per emotion label
train_df.groupby('emotion')['text'].count()

## Histogram of the emotion distribution

In [16]:
# percentage of the training tweets that carry each emotion label
labels = train_df['emotion'].unique()
post_total = len(train_df)
df1 = train_df.groupby(['emotion']).count()['text']
df1 = df1.apply(lambda n_posts: round(n_posts*100/post_total,3))

# draw one bar per emotion on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(df1.index, df1.values)

# labels, title and grid
ax.set_ylabel('% of instances')
ax.set_xlabel('Emotion')
ax.set_title('Emotion distribution')
ax.grid(True)
plt.show()

# 2. Feature Engineering
## 2.1 TF-IDF Vectorizer & Tweet Tokenizer

In [None]:
import nltk
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

# One shared NLTK tweet tokenizer: the vectorizer calls tknzr once per document,
# so building the tokenizer here avoids constructing a new one for every tweet.
_TWEET_TOKENIZER = TweetTokenizer()


def tknzr(text):
    """Split one tweet into a list of tokens with NLTK's TweetTokenizer.

    Passed to the vectorizer as its ``tokenizer`` callable.
    """
    return _TWEET_TOKENIZER.tokenize(text)

# Build TF-IDF features from the training text, tokenised with tknzr.
# min_df=20 drops terms that occur in fewer than 20 tweets; max_df=0.95 drops
# terms that occur in more than 95% of tweets; only unigrams are kept.
vectorizer = TfidfVectorizer(min_df=20, max_df=0.95, ngram_range=(1,1), stop_words='english', tokenizer=tknzr)
# fit_transform learns the vocabulary and encodes the corpus in one pass,
# instead of tokenising every tweet twice with fit() followed by transform().
# NOTE(review): tfidf_transformed is not referenced by the modelling cells
# visible in this notebook, which use bow_transformed.
tfidf_transformed = vectorizer.fit_transform(train_df['text'])

## 2.2 Count Vectorizer & Tweet Tokenizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TweetTokenizer

# One shared NLTK tweet tokenizer: the vectorizer calls tknzr once per document,
# so building the tokenizer here avoids constructing a new one for every tweet.
# NOTE(review): this repeats the tknzr definition from the TF-IDF cell above.
_TWEET_TOKENIZER = TweetTokenizer()


def tknzr(text):
    """Split one tweet into a list of tokens with NLTK's TweetTokenizer.

    Passed to the vectorizer as its ``tokenizer`` callable.
    """
    return _TWEET_TOKENIZER.tokenize(text)

# Bag-of-words counts over the training text, capped at 100000 vocabulary terms.
count_vect = CountVectorizer(max_features=100000, tokenizer=tknzr)
# fit_transform learns the vocabulary and encodes the corpus in one pass,
# instead of tokenising every tweet twice with fit() followed by transform().
# count_vect stays fitted and is reused later to encode the test tweets.
bow_transformed = count_vect.fit_transform(train_df['text'])

# 3. Model

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled tweets for validation; the seed is fixed so the
# split is the same on every run.
bow_features = bow_transformed
emotion_labels = train_df['emotion']
X_train, X_test, y_train, y_test = train_test_split(
    bow_features,
    emotion_labels,
    test_size=0.2,
    random_state=1,
)

In [None]:
# report the dimensions of each piece of the hold-out split
split_parts = (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
)
for part_name, part in split_parts:
    print(f'{part_name}.shape: ', part.shape)

In [None]:
# (rows, columns) of the labelled and unlabelled frames
for frame in (train_df, test_df):
    print(frame.shape)

## 3.1 Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# build a multinomial Naive Bayes classifier and fit it on the training split
NB_model = MultinomialNB().fit(X_train, y_train)

# predicted labels for the training split and for the held-out split
y_train_pred, y_test_pred = (NB_model.predict(features) for features in (X_train, X_test))

In [None]:
# check the model accuracy
from sklearn.metrics import accuracy_score

# fraction of correctly predicted labels on each split
acc_train = accuracy_score(y_train, y_train_pred)
acc_test = accuracy_score(y_test, y_test_pred)

print(f'training accuracy: {round(acc_train, 2)}')
print(f'testing accuracy: {round(acc_test, 2)}')

In [None]:
# precision, recall, f1-score,
from sklearn.metrics import classification_report

# per-class precision / recall / f1 on the held-out split
report_text = classification_report(y_test, y_test_pred)
print(report_text)

In [None]:
# check by confusion matrix
from sklearn.metrics import confusion_matrix

# confusion matrix of the held-out predictions
cm = confusion_matrix(y_test, y_test_pred)
print(cm)

In [None]:
# function for visualizing confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import itertools

def plot_confusion_matrix(cm, classes, title='Confusion matrix',
                          cmap=sns.cubehelix_palette(as_cmap=True)):
    """
    Draw a confusion matrix as an annotated heat map.

    This function is modified from: 
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    Parameters
    ----------
    cm : 2-D integer array
        Confusion matrix in the layout produced by
        ``sklearn.metrics.confusion_matrix``: rows are true labels, columns
        are predicted labels. Cells are formatted with ``'d'``, so the values
        must be integers.
    classes : iterable of str
        Class names. A sorted copy is used for the tick labels; the caller's
        object is not modified.
    title : str
        Title of the figure.
    cmap : matplotlib colormap
        Colormap used for the heat map.
    """
    # sorted() makes a new list, so the caller's list keeps its order
    class_names = sorted(classes)

    fig, ax = plt.subplots(figsize=(10,10))
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # imshow puts matrix rows on the y axis and columns on the x axis, so the
    # true labels (rows) belong on y and the predicted labels (columns) on x.
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels = class_names,
           yticklabels = class_names,
           title = title,
           xlabel = 'Predicted label',
           ylabel = 'True label')

    fmt = 'd'
    # cells darker than half the maximum get white text for contrast
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        ax.text(j, i, format(cm[i, j], fmt), horizontalalignment="center", color="white" if cm[i, j] > thresh else "black")
    # explicit limits keep the first and last rows from being cut in half
    ylim_top = len(class_names) - 0.5
    ax.set_ylim([ylim_top, -.5])
    fig.tight_layout()
    plt.show()

# OUTPUT

In [None]:
# distinct emotion labels, in order of first appearance
pd.unique(train_df['emotion'])

In [None]:
# plot the confusion matrix of the held-out predictions
my_tags = [
    'anticipation',
    'sadness',
    'fear',
    'joy',
    'anger',
    'trust',
    'disgust',
    'surprise',
]
plot_confusion_matrix(cm, my_tags, 'Confusion matrix')

In [None]:
# Features for the final model: every labelled tweet for fitting and the
# unlabelled test tweets for prediction, both encoded with the fitted count_vect.
# These names replace the hold-out split created earlier.
y_train = train_df['emotion']
X_train, X_test = (count_vect.transform(frame['text']) for frame in (train_df, test_df))

In [None]:
# refit the classifier on all labelled tweets
NB_model = NB_model.fit(X_train, y_train)

# predict an emotion for every test tweet
prediction = NB_model.predict(X_test)

# store the predictions and write the (id, emotion) table as csv
test_df['emotion'] = prediction
output = test_df[['tweet_id', 'emotion']].rename(columns={'tweet_id': 'id'})
output.to_csv("bow-naive-bayes.csv", index=False)
test_df

In [None]:
# display the (id, emotion) table that was written to bow-naive-bayes.csv
output