# Import and load data

### Import

In [1]:
import csv
import re

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

!pip install emoji
import emoji # to remove emojies

from scipy import sparse 

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report


from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import plot_confusion_matrix

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-2.1.0.tar.gz (216 kB)
[K     |████████████████████████████████| 216 kB 7.3 MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.1.0-py3-none-any.whl size=212392 sha256=d163486927312ccbe63234fd04028646b130b4a2e642035b6519dbf2a3f35899
  Stored in directory: /root/.cache/pip/wheels/77/75/99/51c2a119f4cfd3af7b49cc57e4f737bed7e40b348a85d82804
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-2.1.0


### Load data

In [2]:
url_train_dev = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTOZ2rC82rhNsJduoyKYTsVeH6ukd7Bpxvxn_afOibn3R-eadZGXu82eCU9IRpl4CK_gefEGsYrA_oM/pub?gid=1863430984&single=true&output=tsv'
url_test = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vT-KNR9nuYatLkSbzSRgpz6Ku1n4TN4w6kKmFLkA6QJHTfQzmX0puBsLF7PAAQJQAxUpgruDd_RRgK7/pub?gid=417546901&single=true&output=tsv'

In [3]:
from io import StringIO
import requests

def load_dataset(url):
    r = requests.get(url)
    data = r.content.decode('utf8')
    df = pd.read_csv(StringIO(data), sep='\t')
    df.columns = ['tweet', 'label']
    return df

In [4]:
df_train_dev = load_dataset(url_train_dev)
df_test = load_dataset(url_test)

# Preprocessing
### Split data into train, test sets

In [5]:
# Remove lables not in train but appear in test
drop_lables = ["mr","mn","eu","lv","la","zu","yo","mk","sk"]
drop_indexes=[]
for l in drop_lables:
  drop_indexes.append(df_test[df_test["label"]==l].index.values)
drop_indexes = np.concatenate(drop_indexes)

In [6]:
X_train = df_train_dev.tweet
y_train = df_train_dev.label
X_test = df_test.tweet
y_test = df_test.label
for index in drop_indexes:
  X_test=X_test.drop(index)
  y_test=y_test.drop(index)

### Encode lables

In [7]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

### Functions to process data

In [8]:
# Convert to lower case
def convert_to_lowercase(X):
  return X.str.lower()

# Removing emojies
def remove_emojies(X):
  return X.apply(lambda x: emoji.replace_emoji(x, replace=''))

def remove_urls(X):
  return X.apply(lambda x:re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', " ", x))

# Average number of words in a sentence
def average_words(X):
  word_average = [ ((tweet.count(" ")+1) if tweet.count(".")==0 else ((tweet.count(" ")+1)/tweet.count(".")) for tweet in X)]
  scaler = MinMaxScaler()
  word_average=pd.DataFrame(word_average)
  word_average=scaler.fit_transform(word_average)
  return word_average.T

# Number of spaces in a tweet
def spaces(X):
  spaces = [ tweet.count(" ") for tweet in X]
  scaler = MinMaxScaler()
  spaces=pd.DataFrame(spaces)
  spaces=scaler.fit_transform(spaces)
  return spaces

# Multilayer Perceptron


### MLP1

In [None]:
# 2 hidden layers size 100,100
# SGD
# reul
# early stopping
mlp1 = Pipeline([
    ('lowercase_transformer',FunctionTransformer(convert_to_lowercase)),
    ('emojies_transformer',FunctionTransformer(remove_emojies)),
    ('url_transformer', FunctionTransformer(remove_urls)),
    ('union',FeatureUnion(
        [
            # feature average number of words
            ('average_words_transformer', FunctionTransformer(average_words)),

            # feature # of spaces
            ('spaces_transformer', FunctionTransformer(spaces)),

            # vetorize words (ignore those that appear less than 3 times)
            ("word_vectorizer", TfidfVectorizer(ngram_range=(1,3), min_df=3)),
            
            # vetorize char (ignore those that appear less than 50 times)
            ("char_vectorizer", TfidfVectorizer(analyzer='char', min_df=50,ngram_range=(3,3)))
        ]
      )
    ),
    ('mlp', MLPClassifier(solver='sgd',hidden_layer_sizes=(100,100),activation='relu',early_stopping=True))
])
mlp1.fit(X_train,y_train)

Pipeline(steps=[('lowercase_transformer',
                 FunctionTransformer(func=<function convert_to_lowercase at 0x7f03124017a0>)),
                ('emojies_transformer',
                 FunctionTransformer(func=<function remove_emojies at 0x7f0312401b00>)),
                ('url_transformer',
                 FunctionTransformer(func=<function remove_urls at 0x7f0312401e60>)),
                ('union',
                 FeatureUnion(transformer_list=[('average_words_tran...
                                                 FunctionTransformer(func=<function average_words at 0x7f031240a050>)),
                                                ('spaces_transformer',
                                                 FunctionTransformer(func=<function spaces at 0x7f031240a200>)),
                                                ('word_vectorizer',
                                                 TfidfVectorizer(min_df=3,
                                                                 ng

In [None]:
y_pred_MLP_1 =  mlp1.predict(X_test)

correct = 0

for index, prediction in enumerate(y_pred_MLP_1):
    if prediction == y_test[index]:
        correct +=1

print('Accuracy: ', correct/y_test.shape[0])

Accuracy:  0.35868827742178666


In [None]:
print(classification_report(y_test,y_pred_MLP_1))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       529
           1       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00        50
          12       0.00      0.00      0.00        11
          13       0.36      1.00      0.53      4758
          14       0.00      0.00      0.00      1476
          16       0.00      0.00      0.00         5
          17       0.00      0.00      0.00         8
          18       0.00      0.00      0.00       224
          21       0.00      0.00      0.00        14
          22       0.00      0.00      0.00         4
          23       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### MLP2

In [None]:
# 1 hidden layer size 100
# SGD
# relu
# Early stopping
mlp2 = Pipeline([
    ('lowercase_transformer',FunctionTransformer(convert_to_lowercase)),
    ('emojies_transformer',FunctionTransformer(remove_emojies)),
    ('url_transformer', FunctionTransformer(remove_urls)),
    ('union',FeatureUnion(
        [
            # feature average number of words
            ('average_words_transformer', FunctionTransformer(average_words)),

            # feature # of spaces
            ('spaces_transformer', FunctionTransformer(spaces)),

            # vetorize words (ignore those that appear less than 3 times)
            ("word_vectorizer", TfidfVectorizer(ngram_range=(1,3), min_df=3)),
            
            # vetorize char (ignore those that appear less than 50 times)
            ("char_vectorizer", TfidfVectorizer(analyzer='char', min_df=50,ngram_range=(3,3)))
        ]
      )
    ),
    ('mlp', MLPClassifier(solver='sgd',activation='relu',early_stopping=True))
])
mlp2.fit(X_train,y_train)

Pipeline(steps=[('lowercase_transformer',
                 FunctionTransformer(func=<function convert_to_lowercase at 0x7f609d7dd200>)),
                ('emojies_transformer',
                 FunctionTransformer(func=<function remove_emojies at 0x7f609d7dd3b0>)),
                ('url_transformer',
                 FunctionTransformer(func=<function remove_urls at 0x7f609d7dd560>)),
                ('union',
                 FeatureUnion(transformer_list=[('average_words_tran...
                                                 FunctionTransformer(func=<function average_words at 0x7f609d7dd710>)),
                                                ('spaces_transformer',
                                                 FunctionTransformer(func=<function spaces at 0x7f609d7dd8c0>)),
                                                ('word_vectorizer',
                                                 TfidfVectorizer(min_df=3,
                                                                 ng

In [None]:
y_pred_MLP_2 =  mlp2.predict(X_test)

correct = 0

for index, prediction in enumerate(y_pred_MLP_2):
    if prediction == y_test[index]:
        correct +=1

print('Accuracy: ', correct/y_test.shape[0])

Accuracy:  0.35868827742178666


In [None]:
print(classification_report(y_test,y_pred_MLP_2))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       529
           1       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00        50
          12       0.00      0.00      0.00        11
          13       0.36      1.00      0.53      4758
          14       0.00      0.00      0.00      1476
          16       0.00      0.00      0.00         5
          17       0.00      0.00      0.00         8
          18       0.00      0.00      0.00       224
          21       0.00      0.00      0.00        14
          22       0.00      0.00      0.00         4
          23       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### MLP3

In [None]:
# 1 hidden layer size 100
# Adam
# relu
# Early stopping
mlp3 = Pipeline([
    ('lowercase_transformer',FunctionTransformer(convert_to_lowercase)),
    ('emojies_transformer',FunctionTransformer(remove_emojies)),
    ('url_transformer', FunctionTransformer(remove_urls)),
    ('union',FeatureUnion(
        [
            # feature average number of words
            ('average_words_transformer', FunctionTransformer(average_words)),

            # feature # of spaces
            ('spaces_transformer', FunctionTransformer(spaces)),

            # vetorize words (ignore those that appear less than 3 times)
            ("word_vectorizer", TfidfVectorizer(ngram_range=(1,3), min_df=3)),
            
            # vetorize char (ignore those that appear less than 50 times)
            ("char_vectorizer", TfidfVectorizer(analyzer='char', min_df=50,ngram_range=(3,3)))
        ]
      )
    ),
    ('mlp', MLPClassifier(activation='relu',early_stopping=True))
])
mlp3.fit(X_train,y_train)

Pipeline(steps=[('lowercase_transformer',
                 FunctionTransformer(func=<function convert_to_lowercase at 0x7f609d7dd200>)),
                ('emojies_transformer',
                 FunctionTransformer(func=<function remove_emojies at 0x7f609d7dd3b0>)),
                ('url_transformer',
                 FunctionTransformer(func=<function remove_urls at 0x7f609d7dd560>)),
                ('union',
                 FeatureUnion(transformer_list=[('average_words_transformer',
                                                 FunctionTransformer(func=<function average_words at 0x7f609d7dd710>)),
                                                ('spaces_transformer',
                                                 FunctionTransformer(func=<function spaces at 0x7f609d7dd8c0>)),
                                                ('word_vectorizer',
                                                 TfidfVectorizer(min_df=3,
                                                             

In [None]:
y_pred_MLP_3 =  mlp3.predict(X_test)

correct = 0

for index, prediction in enumerate(y_pred_MLP_3):
    if prediction == y_test[index]:
        correct +=1

print('Accuracy: ', correct/y_test.shape[0])
print(classification_report(y_test,y_pred_MLP_3))

Accuracy:  0.8874481718808895
              precision    recall  f1-score   support

           0       0.97      0.96      0.96       529
           1       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          10       0.89      0.78      0.83        50
          12       0.00      0.00      0.00        11
          13       0.95      0.97      0.96      4758
          14       0.94      0.96      0.95      1476
          16       0.00      0.00      0.00         5
          17       0.00      0.00      0.00         8
          18       0.96      0.90      0.93       224
          21       0.00      0.00      0.00        14
          22       0.00      0.00      0.00        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### MLP 4

In [None]:
# 2 hidden layer size 75 75
# Adam
# relu
# Early stopping
mlp4 = Pipeline([
    ('lowercase_transformer',FunctionTransformer(convert_to_lowercase)),
    ('emojies_transformer',FunctionTransformer(remove_emojies)),
    ('url_transformer', FunctionTransformer(remove_urls)),
    ('union',FeatureUnion(
        [
            # feature average number of words
            ('average_words_transformer', FunctionTransformer(average_words)),

            # feature # of spaces
            ('spaces_transformer', FunctionTransformer(spaces)),

            # vetorize words (ignore those that appear less than 3 times)
            ("word_vectorizer", TfidfVectorizer(ngram_range=(1,3), min_df=3)),
            
            # vetorize char (ignore those that appear less than 50 times)
            ("char_vectorizer", TfidfVectorizer(analyzer='char', min_df=50,ngram_range=(3,3)))
        ]
      )
    ),
    ('mlp', MLPClassifier(activation='relu', hidden_layer_sizes=(75,75),early_stopping=True))
])
mlp4.fit(X_train,y_train)

In [None]:
y_pred_MLP_4 =  mlp4.predict(X_test)

correct = 0

for index, prediction in enumerate(y_pred_MLP_4):
    if prediction == y_test[index]:
        correct +=1

print('Accuracy: ', correct/y_test.shape[0])
print(classification_report(y_test,y_pred_MLP_4))

### MLP 5

In [None]:
# 1 hidden layer size 100
# Adam
# relu
# Early stopping
# use bi gram char and tri gram char
mlp5 = Pipeline([
    ('lowercase_transformer',FunctionTransformer(convert_to_lowercase)),
    ('emojies_transformer',FunctionTransformer(remove_emojies)),
    ('url_transformer', FunctionTransformer(remove_urls)),
    ('union',FeatureUnion(
        [
            # feature average number of words
            ('average_words_transformer', FunctionTransformer(average_words)),

            # feature # of spaces
            ('spaces_transformer', FunctionTransformer(spaces)),

            # vetorize words (ignore those that appear less than 3 times)
            ("word_vectorizer", TfidfVectorizer(ngram_range=(1,3), min_df=3)),
            
            # vetorize char (ignore those that appear less than 50 times)
            ("char_vectorizer", TfidfVectorizer(analyzer='char', min_df=50,ngram_range=(3,3)))
        ]
      )
    ),
    ('mlp', MLPClassifier(activation='relu',early_stopping=True))
])
mlp5.fit(X_train,y_train)

In [None]:
y_pred_MLP_5 =  mlp5.predict(X_test)

correct = 0

for index, prediction in enumerate(y_pred_MLP_5):
    if prediction == y_test[index]:
        correct +=1

print('Accuracy: ', correct/y_test.shape[0])
print(classification_report(y_test,y_pred_MLP_5))

In [None]:
from joblib import load, dump

dump(mlp, 'MLP1.joblib')
dump(mlp2, 'MLP2.joblib')
dump(mlp3, 'MLP3.joblib')
dump(mlp4, 'MLP4.joblib')
dump(mlp5, 'MLP5.joblib')

['MLP5.joblib']