### Import Libraries

In [1]:
import pandas as pd
import numpy as np

from textblob import TextBlob

from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef

### Load Dataset

In [2]:
# training and testing data
# train = pd.read_csv('data/train.csv') # 25000
# test = pd.read_csv('data/test.csv')  # 2500

# since I already applied language detection and zero-shot classification on `train.csv` and the results are saved in `analyzed_train.csv`
analyze =  pd.read_csv('./data/analyzed_train.csv')
analyze.columns = analyze.columns.str.replace(' ', '') # remove blank space in column names for easier manipulation
analyze = analyze.drop(['Unnamed:0'], axis=1) # drop duplicated index column
analyze.head(30)

Unnamed: 0,content,sentiment,predicted_label,detected_language
0,Не подошло.,negative,negative,ru
1,"товара нет, деньги вернул",negative,positive,
2,"[옵션]문풍지종류선택:3M실내용,규격선택:4. 대형",neutral,positive,
3,Kirain Alice rokoknya esse change wkwk,neutral,negative,
4,El hotel ideal para descansar y relajarte. El ...,positive,positive,
5,"vendedor não cumpri com a venda, vende o produ...",negative,negative,
6,I agree with everybody who insulted this app i...,negative,negative,en
7,L'ex Kenny Lawson'on fire' contro il suo passa...,neutral,negative,
8,"Listrik Padam, 18 Kereta Api Terlambat",neutral,negative,id
9,This product works well. Priced well. softens ...,positive,positive,en


# EDA

I did an EDA before jumping into modeling to understand better **what kind of data**, **how many examples** there are, and **the goal of modeling**.

- **what kind of data** : 
    1. textual data of different length ;
    2. multilingual : 17 languages ;
    3. variety of user-generated : twitter-like with @mention, shopping review, etc ;
    4. presence of emoji (only or mixed with text) and various punctuation marks ;
    5. small presence of code-switching (usage of two or more languages in the same sentence or paragraph) ;
    6. 3 sentiment / polarity labels `positive`, `negative, `neutral`.
    
- **how many examples** :
    * 25000 samples in training set, no empty sample nor empty label
    slightly imbalanced
    1. 8823 neutral samples, 35.29% 
    2. 8318 positive samples, 33.27%
    3. 7858 negative samples, 31.43%

- **the goal of modeling** :
    * multi-class classification

In [3]:
# how many examples in each sentiment label in the training set
analyze.sentiment.value_counts() 

neutral       8823
positive      8318
negative      7858
unassigned       1
Name: sentiment, dtype: int64

In [4]:
# print out to see what is the 'unassigned' content
print(analyze[analyze.sentiment == 'unassigned'])

# delete the 1 sample in 'unassigned' label
# unfortunately I don't speak arabic and there is 
analyze = analyze.drop(analyze[analyze.sentiment == 'unassigned'].index)

                content   sentiment predicted_label detected_language
5657  ويلييي شو بتصرع💙💙  unassigned        positive               NaN


In [5]:
# percentage in each sentiment label in the training set
analyze.sentiment.value_counts() / analyze.sentiment.value_counts().sum()

neutral     0.352934
positive    0.332733
negative    0.314333
Name: sentiment, dtype: float64

In [6]:
# number of contents whose language hasn't been detected
print('Num of contents without languaged detected: ', analyze.detected_language.isna().sum())

# number of langages detected in content column
print('Num of languaged detected: ', analyze.detected_language.value_counts().nunique())

# list of 17 detected languages
print('List of 17 detected languages: ', analyze.detected_language.value_counts().index.tolist())

Num of contents without languaged detected:  24517
Num of languaged detected:  17
List of 17 detected languages:  ['en', 'id', 'ru', 'ar', 'es', 'fr', 'pt', 'it', 'zh-CN', 'ms', 'ja', 'ko', 'vi', 'th', 'zh-TW', 'hi', 'tl', 'km', 'de', 'tr', 'nl', 'da', 'uk', 'az', 'et', 'fa', 'bn', 'pl', 'hr', 'ml', 'ur', 'gu', 'lt', 'bg', 'iw', 'gd', 'ceb', 'ta', 'sv', 'el', 'sk', 'ca', 'lv', 'so', 'ro']


In [7]:
analyze['text_char_length'] = analyze['content'].apply(lambda row: len(row))
analyze['text_char_length'].value_counts()

62      288
59      273
55      268
57      266
53      261
       ... 
1347      1
1424      1
833       1
852       1
1604      1
Name: text_char_length, Length: 1010, dtype: int64

In [8]:
analyze[analyze['text_char_length'] == 1]

Unnamed: 0,content,sentiment,predicted_label,detected_language,text_char_length
2675,💚,positive,positive,,1
21743,),negative,positive,,1


In [9]:
analyze[analyze['text_char_length'] == 2]

Unnamed: 0,content,sentiment,predicted_label,detected_language,text_char_length
1306,데백,positive,neutral,,2
2235,Xi,positive,positive,,2
3536,Kt,negative,neutral,,2
3774,피피,negative,positive,,2
4495,হজ,positive,positive,,2
5017,بص,positive,positive,,2
5317,Ii,negative,positive,,2
5682,Yv,positive,positive,,2
5911,سم,positive,negative,,2
5985,TB,positive,negative,,2


In [10]:
analyze[analyze['text_char_length'] == 3]

Unnamed: 0,content,sentiment,predicted_label,detected_language,text_char_length
76,Dhh,negative,neutral,,3
384,期待！,neutral,positive,,3
460,박지성,neutral,positive,ko,3
928,Ляд,neutral,neutral,,3
1200,ااا,negative,positive,,3
1757,তার,negative,neutral,,3
1877,تلا,negative,neutral,,3
2270,제밌어,positive,positive,,3
2279,gzb,positive,negative,,3
2565,Tg.,negative,positive,,3


#### Language Detection using Textblob

Detect the blob’s language using the Google Translate API and it requires an internet connection. One caveat - it requires at minimun 3 characters to recognize the langage, but sometimes even with a few words Textblob may not succed to detect the language. 

(ps. to uncomment multiple lines, just `select all` and `ctrl`+`/`)

In [None]:
## function : use Textblob to detect languages of the content

# def language_detect(text):
#     try:
#         return TextBlob(text).detect_language()
#     # if there is error catched, just pass
#     except: 
#         pass

# train['language'] = train.content.apply(lambda row: language_detect(row))


### Baseline : Zero-Shot Classification 

I use `joeddav/xlm-roberta-large-xnli`, which is the version of `xlm-roberta-large` fine-tuned on NLI natural language inference datasets in 15 langauges. `transformers.ZeroShotClassificationPipeline` can only be used with models that have been fine-tuned on NLI tasks. 

(ps. to uncomment multiple lines, just `select all` and `ctrl`+`/`)

In [None]:
# from transformers import pipeline
# classifier = pipeline("zero-shot-classification",
#                       model="joeddav/xlm-roberta-large-xnli")

# def baseline(sequence_to_classify):
#     """
#     Use XLM-R to make zero-shot sentiment analysis as a baseline model.
    
#     Parameters
#     ----------
#     sequence_to_classify : str
#         A string to feed into baseline model to make prediction
    
#     Returns
#     ----------
#     predicted_label : str
#         A string of which sentiment (positive, negative, neutral) is predicted by the baseline model
#     """
#     candidate_labels = ["positive", "negative", "neutral"]
#     result = classifier(sequence_to_classify, candidate_labels)
#     result_label_list = result['labels']
#     score_argmax_index = np.array(result['scores']).argmax()
#     predicted_label = result_label_list[score_argmax_index]
#     return predicted_label

# # baseline model makes zero-shot sentiment prediction on training set
# df["predicted_label"] = df.content.apply(lambda row: baseline(row))

In [15]:
#  use accuracy_score, f1_score, matthews_corrcoef from sklearn to evaluate baseline model performance
y_true = analyze.sentiment.values
y_pred_zero_shot = analyze.predicted_label.values

print('accuracy :',accuracy_score(y_true, y_pred_zero_shot))
print('F1 :',f1_score(y_true, y_pred_zero_shot, average='weighted'))
print('MCC :',matthews_corrcoef(y_true, y_pred_zero_shot))

accuracy : 0.52952
F1 : 0.4567419027211373
MCC : 0.3399870617694973
