In [3]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
pd.set_option('display.max_colwidth', None)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import recall_score, make_scorer
from sklearn.model_selection import GridSearchCV
import mord

In [4]:
df1 = pd.read_csv("/kaggle/input/semeval/twitter-2016dev-A.tsv", delimiter = "\t", header = None, names = ["timestamp", "label", "text"])
df2 = pd.read_csv("/kaggle/input/semeval/twitter-2016train-A.tsv", delimiter = "\t", header = None, names = ["timestamp", "label", "text"])
df3 = pd.read_csv("/kaggle/input/semeval/twitter-2016test-A.tsv", sep='\t', header=None, names = ["timestamp", "label", "text"], usecols=[0, 1, 2])

In [8]:
df1.drop_duplicates(subset=['label', "text"], inplace = True)
df2.drop_duplicates(subset=['label', "text"], inplace = True)
df3.drop_duplicates(subset=['label', "text"], inplace = True)

In [9]:
def clean_text(text):
    
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'#\S+', '', text)
    text = re.sub(r'@\S+', '', text)
    text = re.sub(r'&[a-zA-Z]+;', '', text)
    text = re.sub(r'lol', '<lolface>', text, flags=re.IGNORECASE)
    text = re.sub(r':\)', '<smile>', text)
    text = re.sub(r':\(', '<sadface>', text)
    
    text = text.lower()
    
    text = re.sub(r'\bu\b', 'you', text)
   
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
         
    return text

In [10]:
df1["text"] = df1["text"].apply(clean_text)
df2["text"] = df2["text"].apply(clean_text)
df3["text"] = df3["text"].apply(clean_text)

In [11]:
replace_dict = {
    "neutral":1,
    "positive":2,
    "negative":0
}

df1["label"] = df1["label"].replace(replace_dict)
df2["label"] = df2["label"].replace(replace_dict)
df3["label"] = df3["label"].replace(replace_dict)

In [12]:
X_train, y_train = df2['text'], df2['label']
X_dev, y_dev = df1['text'], df1['label']
X_test, y_test = df3['text'], df3['label']

In [13]:
ordinal_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('reg', mord.LogisticIT())
])

In [16]:
ordinal_parameters = {
    'vect__ngram_range':[(1, 1), (1, 2), (1, 3), (2, 2)],
    'reg__alpha':[0.001, 0.01, 0.1, 1, 10, 100]
}

In [23]:
ordinal_grid_search = GridSearchCV(ordinal_pipeline, ordinal_parameters, cv=5, scoring="recall_macro", n_jobs = -1)

In [24]:
ordinal_grid_search.fit(X_train, y_train)

In [26]:
>>> for param_name in sorted(ordinal_parameters.keys()):
...     print("%s: %r" % (param_name, ordinal_grid_search.best_params_[param_name]))

reg__alpha: 0.1
vect__ngram_range: (1, 3)


In [27]:
from sklearn.metrics import recall_score

dev_predictions = ordinal_grid_search.predict(X_dev)
dev_score = recall_score(y_dev, dev_predictions, average = 'macro')
print("Average Recall on Dev set: ", dev_score)

Average Recall on Dev set:  0.45591022012723875


In [29]:
test_predictions = ordinal_grid_search.predict(X_test)
test_score = recall_score(y_test, test_predictions, average = 'macro')
print("Average Recall on Test set: ", test_score)

Average Recall on Test set:  0.43830829788835696
