In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every
# file found, so the dataset paths used further down can be checked by eye.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the containing directory and the file name into one path.
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import matplotlib.pyplot as plt
import seaborn as sns

from nltk import word_tokenize
from tqdm.notebook import tqdm

#Avoid warning messages
import warnings
warnings.filterwarnings("ignore")

#plotly libraries
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
from plotly.subplots import make_subplots
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn import model_selection
from textblob import TextBlob
import emoji  # https://pypi.org/project/emoji/
import re
import fasttext

In [None]:
def readData(path):
    """Load a CSV file and discard its ``review_id`` column.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A DataFrame holding every column of the file except ``review_id``.

    Raises:
        KeyError: If the file has no ``review_id`` column.
    """
    return pd.read_csv(path).drop(columns='review_id')

def lowerCase(df):
    """Lower-case the ``review`` column of ``df``.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned so calls can be chained.

    Args:
        df: DataFrame with a text column named ``review``.

    Returns:
        The same DataFrame object, with ``review`` lower-cased.
    """
    lowered = df['review'].str.lower()
    df['review'] = lowered
    return df

def rearrange(df):
    """Turn ratings into fastText labels and move the last column to the front.

    Ratings 1 to 5 in the ``rating`` column are replaced by the strings
    ``__label__1`` to ``__label__5``; any other value is left untouched. The
    replacement is written back onto the frame that was passed in.

    Args:
        df: DataFrame containing a ``rating`` column.

    Returns:
        A DataFrame with the same columns, rotated so that the column that was
        last is now first and the others keep their relative order.
    """
    label_for_rating = {
        1: "__label__1",
        2: "__label__2",
        3: "__label__3",
        4: "__label__4",
        5: "__label__5",
    }
    df['rating'] = df['rating'].replace(label_for_rating)

    # Rotate the column order by one: [a, b, ..., z] -> [z, a, b, ...].
    *leading, trailing = df.columns.tolist()
    return df[[trailing] + leading]

In [None]:
def _emoji_lookup():
    """Return a mapping whose keys are emoji characters.

    The attribute that exposes this table is not the same in every release of
    the ``emoji`` package, so it is resolved at call time instead of assuming
    that ``emoji.UNICODE_EMOJI`` exists and is a flat mapping.
    """
    table = getattr(emoji, "EMOJI_DATA", None)
    if table is None:
        table = emoji.UNICODE_EMOJI
        # Some releases nest the table per language: {'en': {...}, ...}.
        if isinstance(table.get("en"), dict):
            table = table["en"]
    return table


def emoji_process(df):
    """Replace emoji in the ``review`` column with their text names.

    Only reviews that contain at least one emoji character are rewritten; all
    other rows are left exactly as they are. A rewritten review has:

    * every emoji replaced by its name (via ``emoji.demojize``), with the
      surrounding ``:`` delimiters turned into spaces,
    * every repeated whitespace-separated token dropped, keeping the first
      occurrence (this removes runs of the same emoji, but it also removes
      ordinary words that appear more than once),
    * ``_`` and ``-`` replaced by spaces.

    Rows are selected with a boolean mask aligned on the frame's index, so the
    function works for any index, not only the default 0..n-1 one.

    Args:
        df: DataFrame with a text column named ``review``.

    Returns:
        The same DataFrame object, modified in place.
    """
    emoji_table = _emoji_lookup()

    def contains_emoji(review):
        # Non-string cells (e.g. NaN) cannot contain emoji and are skipped.
        return isinstance(review, str) and any(ch in emoji_table for ch in review)

    def emoji_cleaning(text):
        # Change emoji to text.
        text = emoji.demojize(text).replace(":", " ")

        # Drop repeated tokens; dict.fromkeys keeps first-seen order.
        unique_tokens = dict.fromkeys(text.split())

        return ' '.join(unique_tokens).replace("_", " ").replace("-", " ")

    has_emoji = df['review'].apply(contains_emoji).astype(bool)
    if has_emoji.any():
        df.loc[has_emoji, 'review'] = df.loc[has_emoji, 'review'].apply(emoji_cleaning)

    return df

In [None]:
# Emoticon patterns, compiled once. The original list also had ':)))' and
# ':))))))' entries, but they could never match because ':)' was replaced
# first; the leftover ')' characters are removed by the punctuation step anyway.
_SAD_EMOTICONS = re.compile(r':\(|: \(\(|:, \(')
_HAPPY_EMOTICONS = re.compile(r':\)|;\)|=\)\)\)\)')
_NOT_LOWER_ALNUM_OR_SPACE = re.compile('[^a-z0-9 ]')


def review_clean(df):
    """Normalise the ``review`` column to lower-case letters, digits and spaces.

    Each review goes through three steps:

    1. Sad emoticons become the word ``dislike`` and happy ones ``smile``. The
       word is padded with spaces so it stays a separate token even when the
       emoticon is glued to a neighbour (``good:)`` -> ``good smile``).
    2. Every character other than ``a-z``, ``0-9`` and space becomes a space.
       Upper-case letters are removed by this step too, so the text is
       expected to be lower-cased beforehand.
    3. Runs of whitespace collapse to a single space and the ends are trimmed.

    Args:
        df: DataFrame with a string column named ``review``.

    Returns:
        The same DataFrame object, with ``review`` overwritten by the cleaned
        text.
    """
    def review_cleaning(text):
        # Change emoticon to text.
        text = _SAD_EMOTICONS.sub(' dislike ', text)
        text = _HAPPY_EMOTICONS.sub(' smile ', text)

        # Delete punctuation (and anything else outside a-z, 0-9, space).
        text = _NOT_LOWER_ALNUM_OR_SPACE.sub(' ', text)

        # split()/join squeezes repeated whitespace and trims both ends.
        return ' '.join(text.split())

    df["review"] = df["review"].apply(review_cleaning)
    return df

In [None]:
trainFilePath="/kaggle/input/student-shopee-code-league-sentiment-analysis/train.csv"
testFilePath="/kaggle/input/student-shopee-code-league-sentiment-analysis/test.csv"

# Load both splits (readData drops the review_id column) and lower-case the text.
train=readData(trainFilePath)
test=readData(testFilePath)
train=lowerCase(train)
test=lowerCase(test)

# Optional emoji-to-text step, left disabled here.
#train=emoji_process(train)

# Apply the same text cleaning to both splits, so anything predicted on `test`
# sees the same kind of text the model is trained on. The cleaning is
# idempotent, so cleaning `test` again later would do no harm.
train=review_clean(train)
test=review_clean(test)

# Convert ratings to fastText labels and move the label column to the front.
train=rearrange(train)
print(train.head(30))

In [None]:
# Write the fastText training file: one "<label> <review text>" line per row.
# - The file is opened with mode 'w', so re-running this cell replaces it
#   instead of appending a second copy of every row.
# - Lines are written directly rather than through to_csv: with sep=' ',
#   to_csv wraps every multi-word review in double quotes, and those quotes
#   would end up inside the first and last tokens fastText sees.
# - Only the label and the review text are written, so extra columns (such as
#   a 'pred' column added by a later cell) never leak into the file.
with open(r'/kaggle/working/trainK2.txt', 'w', encoding='utf-8') as train_file:
    train_file.writelines(
        '{} {}\n'.format(label, review)
        for label, review in zip(train['rating'], train['review'])
    )

In [None]:
# fasttext

# Keyword arguments forwarded unchanged to fasttext.train_supervised.
hyper_params = dict(
    lr=0.35,          # learning rate
    epoch=100,        # number of passes over the training file
    wordNgrams=3,     # number of word n-grams to consider during training
    dim=155,          # size of the word vectors
    ws=5,             # size of the context window
    minn=2,           # minimum length of character n-grams
    maxn=5,           # maximum length of character n-grams
    bucket=2014846,   # number of hash buckets
)

def trainer(filepath: str, hyper_params: dict):
    """Train a supervised fastText model on a labelled text file.

    Args:
        filepath: Path of the training file handed to fastText as ``input``.
        hyper_params: Extra keyword arguments for
            ``fasttext.train_supervised`` (learning rate, epochs, ...).

    Returns:
        The model object returned by ``fasttext.train_supervised``. Nothing is
        saved to disk by this function.
    """
    classifier = fasttext.train_supervised(input=filepath, **hyper_params)
    print("FastText model trained with hyperparameters: \n {}".format(hyper_params))
    return classifier



# Fit the classifier on the training file written earlier in the notebook.
model = trainer(
    filepath="/kaggle/working/trainK2.txt",
    hyper_params=hyper_params,
)

In [None]:
def scoreFT(text: str) -> int:
    """Predict the rating of one review with the module-level ``model``.

    Args:
        text: The review text to classify.

    Returns:
        The rating encoded in the top predicted label, as an integer
        (``"__label__4"`` -> ``4``).

    Raises:
        ValueError: If the part of the label after the last underscore is not
            an integer.
    """
    # Ask for the single most likely label only (k=1); the probabilities that
    # come back with it are not needed.
    labels, _ = model.predict(text, 1)
    # Parse everything after the last underscore rather than just the final
    # character, so a multi-digit label such as "__label__10" is not truncated.
    return int(labels[0].rsplit("_", 1)[-1])

def predictFT(df) -> pd.DataFrame:
    """Add a ``pred`` column holding the predicted rating of every review.

    ``scoreFT`` is called once per value of the ``review`` column. The new
    column is written onto the frame that was passed in.

    Args:
        df: DataFrame with a ``review`` column.

    Returns:
        The same DataFrame object, now with a ``pred`` column.
    """
    predictions = df['review'].apply(scoreFT)
    df['pred'] = predictions
    return df

# Score every training review with the fitted model; predictFT stores the
# result in a new 'pred' column on the same DataFrame.
train=predictFT(train)
# Preview the first 30 rows, which now include the 'pred' column.
print(train.head(30))


In [None]:
def evaluateFT(trainPred, trainOrig):
    """Return the fraction of rows whose prediction equals the true rating.

    Args:
        trainPred: DataFrame with the predicted ratings in a ``pred`` column.
        trainOrig: DataFrame with the true ratings in a ``rating`` column. Its
            rows must line up one-to-one with those of ``trainPred``.

    Returns:
        Number of matching rows divided by ``len(trainPred)``.
    """
    is_correct = trainOrig['rating'] == trainPred['pred']
    return is_correct.sum() / len(trainPred)
# Reload the untouched training file to get the original integer ratings back
# (the 'rating' column of `train` now holds "__label__N" strings).
trainOrig=readData(trainFilePath)

# Accuracy on the training set. The model was fitted on these same rows, so
# this number is not an estimate of accuracy on unseen reviews.
print(evaluateFT(train, trainOrig))
print(trainOrig.head(100))
# `train` is the frame that carries the 'pred' column; the name `pred_train`
# used here before is not defined anywhere in the notebook.
print(train.head(100))