In [721]:
# pip install --upgrade spacy

In [722]:
# !python -m spacy download en_core_web_md

In [723]:
# pip install podium-nlp

In [724]:
# pip install hdbscan

In [725]:
# pip install bertopic

In [726]:
# pip install emoji

In [727]:
# pip install wordsegment

In [728]:
# pip install ekphrasis -U

In [729]:
# pip install contextualSpellCheck

In [730]:
from tqdm import tqdm
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from bertopic import BERTopic
from podium import Vocab, Field, LabelField
from podium.datasets import TabularDataset
from podium.vectorizers import GloVe
from nltk.sentiment import SentimentIntensityAnalyzer
from copy import deepcopy
from nltk.tokenize.casual import TweetTokenizer
from time import time
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from ekphrasis.classes.segmenter import Segmenter

import contextualSpellCheck
import nltk
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import spacy
import re
import emoji
import warnings
warnings.filterwarnings("ignore")

In [731]:
# from google.colab import drive
# drive.mount('/content/drive')

In [732]:
# %cd drive/MyDrive/datasets/

# Preprocessing

In [733]:
nlp = spacy.load("en_core_web_md")

In [734]:
df = pd.read_csv("train/SemEval2018-T3-train-taskA.txt", sep='\t', lineterminator='\n', encoding='utf-8')
df_test = pd.read_csv("goldtest_TaskA/SemEval2018-T3_gold_test_taskA_emoji.txt", sep='\t', lineterminator='\n', encoding='utf-8')
df_replace = pd.read_csv("test_TaskA/SemEval2018-T3_input_test_taskA.txt", sep='\t', lineterminator='\n', encoding='utf-8')

In [735]:
df.rename({'Tweet text\r': 'Tweet text'}, inplace=True, axis=1)
df_test.rename({'Tweet text\r': 'Tweet text'}, inplace=True, axis=1)
df_replace.rename({'Tweet text\r': 'Tweet text'}, inplace=True, axis=1)

## Getting clean text

In [736]:
tokenizer = TweetTokenizer(preserve_case=True, reduce_len=True, strip_handles=True)
df_tmp = df['Tweet text'].apply(tokenizer.tokenize)
df_test_tmp = df['Tweet text'].apply(tokenizer.tokenize)

In [737]:
df['Tweet text'] = df['Tweet text'].apply(emoji.demojize)
df_test['Tweet text'] = df_test['Tweet text'].apply(emoji.demojize)

In [738]:
def emoji_counter(s):
    return len(emoji.emoji_lis(emoji.emojize(s)))

df['emoji_count'] = df['Tweet text'].apply(emoji_counter)

In [739]:
# def filter_emojis(s):
#     print(f'S: {s}')
#     print()
#     d = emoji.emoji_lis(emoji.emojize(s))
#     s_new = ''
#     if d != []:
#         for i, el in enumerate(d):
#             print(i, el)
#             if i == 0:
#                 s_new += s[:el['location']]
#             elif i < len(d):
#                 up_to = sum([k['location'] + len(emoji.demojize(k['emoji'])) for k in d[:i]])
#                 len_text = el['location'] - (d[i-1]['location'] + 1)
#                 s_new += s[up_to : up_to + len_text]
#     return s

In [740]:
# seg_eng = Segmenter(corpus="english") 
seg_tw = Segmenter(corpus="twitter")

Reading twitter - 1grams ...
Reading twitter - 2grams ...


In [741]:
def separate_hashtags(s):
    '''Removes the hashtag sign and segments the hashtag text.'''
    hashtags = []
    
    l = []
    for i, s_i in enumerate(s):
        if s_i.startswith('#'):
            tmp = tokenizer.tokenize(seg_tw.segment(s_i.replace('#', '')))
            l.extend(tmp)
        else:
            l.append(s_i)
    return l
            
df_tmp = df_tmp.apply(separate_hashtags)
df_test_tmp = df_test_tmp.apply(separate_hashtags)

In [745]:
def preprocess_words(s):
    '''
    Removes tags, emojis, links, smiley faces, | signs, stopwords and changes the case to lower.
    '''
    ret_list = []

    smiley_regex = r'([\:\;\=][()PDO\/\]\[p|]+)+'
    
    is_tag = lambda w: w.startswith('@')
    is_vertical_line = lambda w: w.startswith('|')
    is_emoji = lambda w: w != ':' and w.startswith(':') and w.endswith(':')
    remove_emoji = lambda w: w[:w.index(':')] + w[w.rindex(':')+1:] if ':' in w and w.index(':') != w.rindex(':') else w
    is_link = lambda w: w.startswith("http") or w.startswith("https")
    is_hashtag = lambda w: w.startswith("#")
    is_smiley = lambda w: re.match(smiley_regex, w)

    w2 = []
    for i, w in enumerate(s):
        if is_tag(w) or is_emoji(w) or is_link(w) or is_vertical_line(w):
            continue

        elif is_hashtag(w):
            w_tmp = w.replace('#', '')
            if w_tmp != '':
                lower_append(w_tmp, w2)

        elif is_smiley(w):
            w_tmp = re.sub(smiley_regex, '', w)
            if w_tmp != '':
                lower_append(w_tmp, w2)

        else:
            w_tmp = w.replace('#', '')
            w_tmp = w_tmp.replace('|', '')
            w_tmp = w_tmp.replace('_', '')
            w_tmp = w_tmp.replace('...', '')
            if w_tmp != '':
                lower_append(w_tmp, w2)

    return ' '.join([i for i in w2 if len(i) > 2])

def lower_append(w, l):
    l.append(w.lower())

df['clean_text'] = df_tmp.apply(preprocess_words)
df_test['clean_text'] = df_test_tmp.apply(preprocess_words)

In [746]:
print(df.shape)
df.head(15)

(3817, 5)


Unnamed: 0,Tweet index,Label,Tweet text,emoji_count,clean_text
0,1,1,Sweet United Nations video. Just in time for C...,0,sweet united nations video just time for chris...
1,2,1,@mrdahl87 We are rumored to have talked to Erv...,0,are rumored have talked erv's agent and the an...
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...,0,hey there nice see you minnesota winter weather
3,4,0,3 episodes left I'm dying over here,0,episodes left i'm dying over here
4,5,1,I can't breathe! was chosen as the most notabl...,0,can't breathe was chosen the most notable quot...
5,6,0,You're never too old for Footie Pajamas. http:...,0,you're never too old for footie pajamas
6,7,1,Nothing makes me happier then getting on the h...,0,nothing makes happier then getting the highway...
7,8,0,4:30 an opening my first beer now gonna be a l...,0,4:30 opening first beer now gonna long night day
8,9,0,@Adam_Klug do you think you would support a gu...,0,you think you would support guy who knocked ou...
9,10,0,@samcguigan544 You are not allowed to open tha...,0,you are not allowed open that until christmas day


In [None]:
# def simple_preprocessing(s):
#     '''Lowercases, tokenizes, de-accents, removes words shorter than 3 and longer than 14 characters'''
#     return [' '.join(simple_preprocess(s_i)) for s_i in s]

# df['clean_text'] = df[['clean_text']].apply(simple_preprocessing)
# df_test['clean_text'] = df_test[['clean_text']].apply(simple_preprocessing)

In [747]:
print(df.shape)
df.head(15)

(3817, 5)


Unnamed: 0,Tweet index,Label,Tweet text,emoji_count,clean_text
0,1,1,Sweet United Nations video. Just in time for C...,0,sweet united nations video just time for chris...
1,2,1,@mrdahl87 We are rumored to have talked to Erv...,0,are rumored have talked erv's agent and the an...
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...,0,hey there nice see you minnesota winter weather
3,4,0,3 episodes left I'm dying over here,0,episodes left i'm dying over here
4,5,1,I can't breathe! was chosen as the most notabl...,0,can't breathe was chosen the most notable quot...
5,6,0,You're never too old for Footie Pajamas. http:...,0,you're never too old for footie pajamas
6,7,1,Nothing makes me happier then getting on the h...,0,nothing makes happier then getting the highway...
7,8,0,4:30 an opening my first beer now gonna be a l...,0,4:30 opening first beer now gonna long night day
8,9,0,@Adam_Klug do you think you would support a gu...,0,you think you would support guy who knocked ou...
9,10,0,@samcguigan544 You are not allowed to open tha...,0,you are not allowed open that until christmas day


In [748]:
def remove_tuple_characters(s):
    return [re.sub(r'(.)\1{2,}', r'\1', w) for w in s]

df['clean_text'] = df[['clean_text']].apply(remove_tuple_characters)
df_test['clean_text'] = df_test[['clean_text']].apply(remove_tuple_characters)

print(df.shape)
df.head(15)

(3817, 5)


Unnamed: 0,Tweet index,Label,Tweet text,emoji_count,clean_text
0,1,1,Sweet United Nations video. Just in time for C...,0,sweet united nations video just time for chris...
1,2,1,@mrdahl87 We are rumored to have talked to Erv...,0,are rumored have talked erv's agent and the an...
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...,0,hey there nice see you minnesota winter weather
3,4,0,3 episodes left I'm dying over here,0,episodes left i'm dying over here
4,5,1,I can't breathe! was chosen as the most notabl...,0,can't breathe was chosen the most notable quot...
5,6,0,You're never too old for Footie Pajamas. http:...,0,you're never too old for footie pajamas
6,7,1,Nothing makes me happier then getting on the h...,0,nothing makes happier then getting the highway...
7,8,0,4:30 an opening my first beer now gonna be a l...,0,4:30 opening first beer now gonna long night day
8,9,0,@Adam_Klug do you think you would support a gu...,0,you think you would support guy who knocked ou...
9,10,0,@samcguigan544 You are not allowed to open tha...,0,you are not allowed open that until christmas day


In [751]:
def lemmatize(s):
    '''Lemmatizes the words in the sentences and returns them if theyre not stopwords or punctuation'''
    return [[w.lemma_.lower() for w in nlp(s_i) if w.lemma_.lower() not in nlp.Defaults.stop_words and not w.is_punct] for s_i in s]

df['lemmas'] = df[['clean_text']].apply(lemmatize)
df_test['lemmas'] = df_test[['clean_text']].apply(lemmatize)

In [752]:
df.head(15)

Unnamed: 0,Tweet index,Label,Tweet text,emoji_count,clean_text,lemmas
0,1,1,Sweet United Nations video. Just in time for C...,0,sweet united nations video just time for chris...,"[sweet, united, nations, video, time, christma..."
1,2,1,@mrdahl87 We are rumored to have talked to Erv...,0,are rumored have talked erv's agent and the an...,"[rumor, talk, erv, agent, angel, ask, escobar,..."
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...,0,hey there nice see you minnesota winter weather,"[hey, nice, minnesota, winter, weather]"
3,4,0,3 episodes left I'm dying over here,0,episodes left i'm dying over here,"[episode, leave, die]"
4,5,1,I can't breathe! was chosen as the most notabl...,0,can't breathe was chosen the most notable quot...,"[breathe, choose, notable, quote, year, annual..."
5,6,0,You're never too old for Footie Pajamas. http:...,0,you're never too old for footie pajamas,"[old, footie, pajama]"
6,7,1,Nothing makes me happier then getting on the h...,0,nothing makes happier then getting the highway...,"[happy, highway, break, light, light, like, ch..."
7,8,0,4:30 an opening my first beer now gonna be a l...,0,4:30 opening first beer now gonna long night day,"[4:30, open, beer, long, night, day]"
8,9,0,@Adam_Klug do you think you would support a gu...,0,you think you would support guy who knocked ou...,"[think, support, guy, knock, daughter, rice, d..."
9,10,0,@samcguigan544 You are not allowed to open tha...,0,you are not allowed open that until christmas day,"[allow, open, christmas, day]"


In [761]:
### baseline features ###
tokenizer2 = TweetTokenizer()
def word_counter(s):
    return len([x for x in tokenizer.tokenize(s) if not x.startswith(("@", "#", "http"))])

def char_counter(s):
    return len(s.replace(' ', ''))

def all_uppercase_counter(s):
    return len([x for x in tokenizer2.tokenize(s) if x.isupper() and not x.startswith(("@", "#", "http"))])

def all_lowercase_counter(s):
    return len([x for x in tokenizer2.tokenize(s) if x.islower() and not x.startswith(("@", "#", "http"))])

def capitalised_counter(s):
    return sum([i.istitle() for i in tokenizer2.tokenize(s)])

def digit_counter(s):
    return sum([i.isdigit() for i in s])



### other features ###
def tag_counter(s):
    return len([x for x in tokenizer2.tokenize(s) if x.startswith("@")])

def hashtag_counter(s):
    return len([x for x in tokenizer2.tokenize(s) if x.startswith("#")])

def link_counter(s):
    return len([x for x in tokenizer2.tokenize(s) if x.startswith(('http:', 'https:'))])

def smiley_counter(s):
    return len([x for x in tokenizer2.tokenize(s) if re.match(r'([\:\;\=][()PDO\/\]\[p|]+)+', x)])

def exclamation_mark_counter(s):
    return s.count('!')

def question_mark_counter(s):
    return s.count('?')

def ellipsis_counter(s):
    return s.count('...')
    


### NER ###
def ORG_tag_counter(s):
    doc = nlp(s)
    return len([d.text for d in doc.ents if d.label_ == 'ORG'])

def NORP_tag_counter(s):
    doc = nlp(s)
    return len([d.text for d in doc.ents if d.label_ == 'NORP'])

def GPE_tag_counter(s):
    doc = nlp(s)
    return len([d.text for d in doc.ents if d.label_ == 'GPE'])

def PERSON_tag_counter(s):
    doc = nlp(s)
    return len([d.text for d in doc.ents if d.label_ == 'PERSON'])

In [762]:
def add_features(some_df):
    some_df['word_count'] = df['clean_text'].apply(word_counter)
    some_df['char_count'] = df['clean_text'].apply(char_counter)
    some_df['all_uppercase_count'] = df['Tweet text'].apply(all_uppercase_counter)
    some_df['all_lowercase_count'] = df['Tweet text'].apply(all_lowercase_counter)
    some_df['capitalised_count'] = df['Tweet text'].apply(capitalised_counter)
    some_df['digit_count'] = df['Tweet text'].apply(digit_counter)
    
    some_df['tag_count'] = df['Tweet text'].apply(tag_counter)
    some_df['hashtag_count'] = df['Tweet text'].apply(hashtag_counter)
    some_df['link_count'] = df['Tweet text'].apply(link_counter)
    some_df['smiley_count'] = df['Tweet text'].apply(smiley_counter)
    
    some_df['exclamation_mark_count'] = df['Tweet text'].apply(exclamation_mark_counter)
    some_df['question_mark_count'] = df['Tweet text'].apply(question_mark_counter)
    some_df['ellipsis_count'] = df['Tweet text'].apply(ellipsis_counter)
    
    some_df['ORG_tag_count'] = df['Tweet text'].apply(ORG_tag_counter)
    some_df['NORP_tag_count'] = df['Tweet text'].apply(NORP_tag_counter)
    some_df['GPE_tag_count'] = df['Tweet text'].apply(GPE_tag_counter)
    some_df['PERSON_tag_count'] = df['Tweet text'].apply(PERSON_tag_counter)
    
add_features(df)
add_features(df_test)

In [763]:
df.head()

Unnamed: 0,Tweet index,Label,Tweet text,emoji_count,clean_text,lemmas,word_count,char_count,all_uppercase_count,all_lowercase_count,...,hashtag_count,link_count,smiley_count,exclamation_mark_count,question_mark_count,ellipsis_count,ORG_tag_count,NORP_tag_count,GPE_tag_count,PERSON_tag_count
0,1,1,Sweet United Nations video. Just in time for C...,0,sweet united nations video just time for chris...,"[sweet, united, nations, video, time, christma...",10,58,0,4,...,2,1,0,0,0,0,1,0,0,0
1,2,1,@mrdahl87 We are rumored to have talked to Erv...,0,are rumored have talked erv's agent and the an...,"[rumor, talk, erv, agent, angel, ask, escobar,...",15,78,0,14,...,0,0,1,0,0,2,0,0,0,2
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...,0,hey there nice see you minnesota winter weather,"[hey, nice, minnesota, winter, weather]",8,40,1,4,...,0,0,0,1,0,0,0,0,1,0
3,4,0,3 episodes left I'm dying over here,0,episodes left i'm dying over here,"[episode, leave, die]",6,28,0,5,...,0,0,0,0,0,0,0,0,0,0
4,5,1,I can't breathe! was chosen as the most notabl...,0,can't breathe was chosen the most notable quot...,"[breathe, choose, notable, quote, year, annual...",16,88,1,20,...,0,0,0,1,0,0,1,0,0,0


In [764]:
df_train = df.sample(frac=0.7)
df_validation = df.drop(df_train.index)

In [765]:
df_train.head()

Unnamed: 0,Tweet index,Label,Tweet text,emoji_count,clean_text,lemmas,word_count,char_count,all_uppercase_count,all_lowercase_count,...,hashtag_count,link_count,smiley_count,exclamation_mark_count,question_mark_count,ellipsis_count,ORG_tag_count,NORP_tag_count,GPE_tag_count,PERSON_tag_count
2694,2698,0,"#OPEC #chief #defends #policy, #says #group to...",0,opec chief defends policy says group try ride ...,"[opec, chief, defend, policy, group, try, ride...",12,53,0,3,...,11,1,0,0,0,0,2,0,1,0
1184,1185,0,yet you cant provide ANY source that shows it ...,0,yet you cant provide any source that shows,"[provide, source]",8,35,1,8,...,0,0,0,0,0,0,1,0,0,0
1267,1268,1,also sick names,0,also sick names,[sick],3,13,0,3,...,0,0,0,0,0,0,0,0,0,0
1766,1770,1,Quel domage RT @CFL_News: No Canada: Bills in...,0,quel domage canada bills series which was colo...,"[quel, domage, canada, bills, series, colossal...",14,69,2,11,...,2,1,0,0,0,0,3,0,0,0
2586,2590,0,@PaulGoonerW yeah stressed isnt the word could...,1,yeah stressed isnt the word could easily punch...,"[yeah, stressed, word, easily, punch, face, ri...",16,101,0,17,...,0,0,0,0,0,0,0,0,0,0


In [766]:
def new_df_with_all_features(df1, df2):
    cols_to_add = list(set(df2.columns.tolist()) - set(df1.columns.tolist()))
    if 'Tweet index' in cols_to_add:
        cols_to_add.remove('Tweet index')
    new_df = pd.concat((df1.copy(), df2[cols_to_add]), axis=1)
    return new_df

In [767]:
def join_docs(s):
    '''Joins the strings inside the inner list of a nested list'''
    return ' '.join(s)

df_train['topic_text'] = df_train['lemmas'].apply(join_docs)
df_validation['topic_text'] = df_validation['lemmas'].apply(join_docs)
df_test['topic_text'] = df_test['lemmas'].apply(join_docs)

In [768]:
df_train.shape

(2672, 24)

In [769]:
df_train.head()

Unnamed: 0,Tweet index,Label,Tweet text,emoji_count,clean_text,lemmas,word_count,char_count,all_uppercase_count,all_lowercase_count,...,link_count,smiley_count,exclamation_mark_count,question_mark_count,ellipsis_count,ORG_tag_count,NORP_tag_count,GPE_tag_count,PERSON_tag_count,topic_text
2694,2698,0,"#OPEC #chief #defends #policy, #says #group to...",0,opec chief defends policy says group try ride ...,"[opec, chief, defend, policy, group, try, ride...",12,53,0,3,...,1,0,0,0,0,2,0,1,0,opec chief defend policy group try ride price ...
1184,1185,0,yet you cant provide ANY source that shows it ...,0,yet you cant provide any source that shows,"[provide, source]",8,35,1,8,...,0,0,0,0,0,1,0,0,0,provide source
1267,1268,1,also sick names,0,also sick names,[sick],3,13,0,3,...,0,0,0,0,0,0,0,0,0,sick
1766,1770,1,Quel domage RT @CFL_News: No Canada: Bills in...,0,quel domage canada bills series which was colo...,"[quel, domage, canada, bills, series, colossal...",14,69,2,11,...,1,0,0,0,0,3,0,0,0,quel domage canada bills series colossal flop ...
2586,2590,0,@PaulGoonerW yeah stressed isnt the word could...,1,yeah stressed isnt the word could easily punch...,"[yeah, stressed, word, easily, punch, face, ri...",16,101,0,17,...,0,0,0,0,0,0,0,0,0,yeah stressed word easily punch face right smi...


## Topic modeling baselines

In [None]:
count_vectorizer = CountVectorizer(
    analyzer='word',
    min_df=20,
    max_df=0.5,
)

tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    min_df=20,
    max_df=0.5,
)

tweet_text_count_train = count_vectorizer.fit_transform(df_train['topic_text'])
tweet_text_count_validation = count_vectorizer.transform(df_validation['topic_text'])
tweet_text_count_test = count_vectorizer.transform(df_test['topic_text'])

tweet_text_tfidf_train = tfidf_vectorizer.fit_transform(df_train['topic_text'])
tweet_text_tfidf_validation = tfidf_vectorizer.transform(df_validation['topic_text'])
tweet_text_tfidf_test = tfidf_vectorizer.transform(df_test['topic_text'])

In [None]:
inertia = []
range_ = list(range(2, 60))
for i in range_:
    model = KMeans(i)
    model.fit(tweet_text_count_train)
    inertia.append(model.inertia_)
    
plt.plot(range_, inertia)
plt.xlabel('Number of components')
plt.ylabel('Inertia')
plt.show()

In [None]:
model = KMeans(17)
model.fit(tweet_text_count_train)

kmeans_count_labels_train = model.predict(tweet_text_count_train)
kmeans_count_labels_validation = model.predict(tweet_text_count_validation)
kmeans_count_labels_test = model.predict(tweet_text_count_test)

In [None]:
inertia = []
for i in range_:
    model = KMeans(i)
    model.fit(tweet_text_tfidf_train)
    inertia.append(model.inertia_)
plt.plot(range_, inertia)
plt.xlabel('Number of components')
plt.ylabel('Inertia')
plt.show()

In [None]:
model = KMeans(22)
model.fit(tweet_text_tfidf_train)

kmeans_tfidf_labels_train = model.predict(tweet_text_tfidf_train)
kmeans_tfidf_labels_validation = model.predict(tweet_text_tfidf_validation)
kmeans_tfidf_labels_test = model.predict(tweet_text_tfidf_test)

### BERTopic

In [771]:
df_train.reset_index(drop=True, inplace=True)
df_validation.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

In [772]:
start = time()
topic_model = BERTopic(top_n_words=10, min_topic_size=20)
topics, probs = topic_model.fit_transform(df_train['topic_text'])
end = time()
print(end-start)
pred_train = topic_model.transform(df_train['topic_text'])
pred_validation = topic_model.transform(df_validation['topic_text'])
pred_test = topic_model.transform(df_test['topic_text'])

24.361565351486206


In [773]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,1172,-1_like_day_love_work
1,0,343,0_money_police_black_people
2,1,157,1_game_win_team_play
3,2,121,2_love_closedeye_facethrowingakiss_flushedface
4,3,111,3_mean_know_funny_glad
5,4,102,4_christmas_gift_holiday_sweater
6,5,97,5_sleep_wake_morning_hour
7,6,92,6_twitter_tweet_talk_retweet
8,7,76,7_day_week_monday_friday
9,8,72,8_drink_food_turkey_hot


In [None]:
# df_train_bertopic = df_train.copy()
# df_validation_bertopic = df_validation.copy()
# df_test_bertopic = df_test.copy()

In [774]:
df_train['topic'] = pred_train[0]
df_validation['topic'] = pred_validation[0]
df_test['topic'] = pred_test[0]

In [None]:
# df_train_bertopic.head()

## Embedding

In [775]:
tweet_embedding_train = df_train[['topic_text', 'Label']]
tweet_embedding_validation = df_validation[['topic_text', 'Label']]
tweet_embedding_test = df_test[['topic_text', 'Label']]

In [776]:
tweet_embedding_train.reset_index(drop=True, inplace=True)
tweet_embedding_validation.reset_index(drop=True, inplace=True)
tweet_embedding_test.reset_index(drop=True, inplace=True)

In [777]:
max_vocab_size = 10_000
vocab = Vocab(max_size=max_vocab_size, min_freq=20)

TWEET = Field('text', numericalizer=vocab)
LABEL = LabelField('Label')

fields = [TWEET, LABEL]

train = TabularDataset.from_pandas(df_train[['topic_text', 'Label']], fields)
validation = TabularDataset.from_pandas(df_validation[['topic_text', 'Label']], fields)
test = TabularDataset.from_pandas(df_test[['topic_text', 'Label']], fields)
train.finalize_fields()

glove = GloVe()
embeddings = glove.load_vocab(vocab)

train_batch = train.batch(add_padding=True)
validation_batch = validation.batch(add_padding=True)
test_batch = test.batch(add_padding=True)

100%|███████████████████████████████████████████████████████████████████████████████| 862M/862M [02:58<00:00, 4.82MB/s]


In [778]:
train_batch['text'].astype(int)[0:5]

array([[ 0,  0,  0,  0,  0, 41,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1],
       [ 0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1],
       [ 0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1],
       [46,  0, 67,  0,  0, 56, 21,  0,  0,  0,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1]])

In [779]:
tweet_train = embeddings[train_batch['text'].astype(int)]
tweet_validation = embeddings[validation_batch['text'].astype(int)]
tweet_test = embeddings[test_batch['text'].astype(int)]

# Mean
tweet_train_mean = tweet_train.mean(axis=1)
tweet_validation_mean = tweet_validation.mean(axis=1)
tweet_test_mean = tweet_test.mean(axis=1)

In [780]:
embeddings_train_mean_df = pd.DataFrame(tweet_train_mean)
df_train = pd.merge(df_train, embeddings_train_mean_df, left_index=True, right_index=True)

embeddings_validation_mean_df = pd.DataFrame(tweet_validation_mean)
df_validation = pd.merge(df_validation, embeddings_validation_mean_df, left_index=True, right_index=True)

embeddings_test_mean_df = pd.DataFrame(tweet_test_mean)
df_test = pd.merge(df_test, embeddings_test_mean_df, left_index=True, right_index=True)

In [781]:
df_train.head()

Unnamed: 0,Tweet index,Label,Tweet text,emoji_count,clean_text,lemmas,word_count,char_count,all_uppercase_count,all_lowercase_count,...,290,291,292,293,294,295,296,297,298,299
0,2698,0,"#OPEC #chief #defends #policy, #says #group to...",0,opec chief defends policy says group try ride ...,"[opec, chief, defend, policy, group, try, ride...",12,53,0,3,...,-0.133291,0.373946,-0.605286,-0.488337,-1.580123,0.299884,0.277527,1.056188,-0.905541,-0.476995
1,1185,0,yet you cant provide ANY source that shows it ...,0,yet you cant provide any source that shows,"[provide, source]",8,35,1,8,...,-0.498801,0.739389,-0.78242,-0.76594,-1.578567,0.666944,-0.028898,0.991089,-1.259082,-0.359712
2,1268,1,also sick names,0,also sick names,[sick],3,13,0,3,...,-0.554514,0.789464,-0.803408,-0.80683,-1.564054,0.717324,-0.079272,0.97356,-1.306909,-0.336406
3,1770,1,Quel domage RT @CFL_News: No Canada: Bills in...,0,quel domage canada bills series which was colo...,"[quel, domage, canada, bills, series, colossal...",14,69,2,11,...,-0.053098,0.338785,-0.614522,-0.438817,-1.694665,0.263905,0.374092,1.131317,-0.876467,-0.546164
4,2590,0,@PaulGoonerW yeah stressed isnt the word could...,1,yeah stressed isnt the word could easily punch...,"[yeah, stressed, word, easily, punch, face, ri...",16,101,0,17,...,-0.169351,0.400346,-0.550667,-0.469496,-1.365752,0.274819,0.222147,0.925074,-0.829983,-0.366258


In [782]:
## Features
## Broj neg rijeci
## Broj poz rijeci
## Omjer
## Udaljenost izmedu poz i neg rijeci

In [783]:
nltk.download('vader_lexicon') # if error run this

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ivan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [784]:
def pos_neg_words(df, limit):
    sid = SentimentIntensityAnalyzer()

    neg_words = []
    neg_word_count = []
    pos_words = []
    pos_word_count = []
    for index, row in df.iterrows():
        lemmas = []
        if len(row['topic_text']) > 0:
            doc = nlp(row['topic_text'])
            for token in doc:
                lemmas.append(token.lemma_)

            current_pos = []
            current_neut = []
            current_neg = []
            for word in lemmas:
                if (sid.polarity_scores(word)['compound']) >= limit:
                    current_pos.append(word)
                elif (sid.polarity_scores(word)['compound']) <= -limit:
                    current_neg.append(word)
                else:
                    current_neut.append(word)

            neg_words.append(deepcopy(current_neg))
            neg_word_count.append(deepcopy(len(current_neg)))
            pos_words.append(deepcopy(current_pos))
            pos_word_count.append(deepcopy(len(current_pos)))
        else:
            neg_words.append([])
            neg_word_count.append(0)
            pos_words.append([])
            pos_word_count.append(0)
    return neg_words, neg_word_count, pos_words, pos_word_count

In [785]:
neg_words_train, neg_word_count_train, pos_words_train, pos_word_count_train = pos_neg_words(df_train, 0.2)
neg_words_val, neg_word_count_val, pos_words_val, pos_word_count_val = pos_neg_words(df_validation, 0.2)
neg_words_test, neg_word_count_test, pos_words_test, pos_word_count_test = pos_neg_words(df_test, 0.2)

In [786]:
df_train['neg_word_count'] = neg_word_count_train
df_train['pos_word'] = pos_words_train
df_train['pos_word_count'] = pos_word_count_train
df_train['neg_word'] = neg_words_train

df_validation['neg_word_count'] = neg_word_count_val
df_validation['pos_word'] = pos_words_val
df_validation['pos_word_count'] = pos_word_count_val
df_validation['neg_word'] = neg_words_val

df_test['neg_word_count'] = neg_word_count_test
df_test['pos_word'] = pos_words_test
df_test['pos_word_count'] = pos_word_count_test
df_test['neg_word'] = neg_words_test

df_train.head()

Unnamed: 0,Tweet index,Label,Tweet text,emoji_count,clean_text,lemmas,word_count,char_count,all_uppercase_count,all_lowercase_count,...,294,295,296,297,298,299,neg_word_count,pos_word,pos_word_count,neg_word
0,2698,0,"#OPEC #chief #defends #policy, #says #group to...",0,opec chief defends policy says group try ride ...,"[opec, chief, defend, policy, group, try, ride...",12,53,0,3,...,-1.580123,0.299884,0.277527,1.056188,-0.905541,-0.476995,0,[],0,[]
1,1185,0,yet you cant provide ANY source that shows it ...,0,yet you cant provide any source that shows,"[provide, source]",8,35,1,8,...,-1.578567,0.666944,-0.028898,0.991089,-1.259082,-0.359712,0,[],0,[]
2,1268,1,also sick names,0,also sick names,[sick],3,13,0,3,...,-1.564054,0.717324,-0.079272,0.97356,-1.306909,-0.336406,1,[],0,[sick]
3,1770,1,Quel domage RT @CFL_News: No Canada: Bills in...,0,quel domage canada bills series which was colo...,"[quel, domage, canada, bills, series, colossal...",14,69,2,11,...,-1.694665,0.263905,0.374092,1.131317,-0.876467,-0.546164,2,[],0,"[flop, cancel]"
4,2590,0,@PaulGoonerW yeah stressed isnt the word could...,1,yeah stressed isnt the word could easily punch...,"[yeah, stressed, word, easily, punch, face, ri...",16,101,0,17,...,-1.365752,0.274819,0.222147,0.925074,-0.829983,-0.366258,1,"[yeah, easily]",2,[stressed]


In [787]:
topic_dummies = []
for i in topic_model.get_topic_info()['Topic'].values:
    topic_dummies.append(f'{i}_topic')
    
topic_dummies

['-1_topic',
 '0_topic',
 '1_topic',
 '2_topic',
 '3_topic',
 '4_topic',
 '5_topic',
 '6_topic',
 '7_topic',
 '8_topic',
 '9_topic',
 '10_topic',
 '11_topic',
 '12_topic',
 '13_topic',
 '14_topic',
 '15_topic',
 '16_topic',
 '17_topic']

In [788]:
df_train[topic_dummies] = pd.get_dummies(df_train['topic'])
df_validation[topic_dummies] = pd.get_dummies(df_validation['topic'])
df_test[topic_dummies] = pd.get_dummies(df_test['topic'])
df_train.head()

Unnamed: 0,Tweet index,Label,Tweet text,emoji_count,clean_text,lemmas,word_count,char_count,all_uppercase_count,all_lowercase_count,...,8_topic,9_topic,10_topic,11_topic,12_topic,13_topic,14_topic,15_topic,16_topic,17_topic
0,2698,0,"#OPEC #chief #defends #policy, #says #group to...",0,opec chief defends policy says group try ride ...,"[opec, chief, defend, policy, group, try, ride...",12,53,0,3,...,0,0,0,0,0,0,0,0,0,0
1,1185,0,yet you cant provide ANY source that shows it ...,0,yet you cant provide any source that shows,"[provide, source]",8,35,1,8,...,0,0,0,0,0,0,0,0,0,0
2,1268,1,also sick names,0,also sick names,[sick],3,13,0,3,...,0,0,0,0,0,0,0,0,0,0
3,1770,1,Quel domage RT @CFL_News: No Canada: Bills in...,0,quel domage canada bills series which was colo...,"[quel, domage, canada, bills, series, colossal...",14,69,2,11,...,0,0,0,0,0,0,0,0,0,0
4,2590,0,@PaulGoonerW yeah stressed isnt the word could...,1,yeah stressed isnt the word could easily punch...,"[yeah, stressed, word, easily, punch, face, ri...",16,101,0,17,...,0,0,0,0,0,0,0,0,0,0


In [790]:
# X_ = df_train[['word_count', 'char_count', 'tag_count', 'hashtag_count', 'link_count', 'smiley_count', 'mark_count', 'has_emoji', 'neg_word_count', 'pos_word_count',
#                '-1_topic', '0_topic', '1_topic', '2_topic', '3_topic', '4_topic', '5_topic', '6_topic', '7_topic']]
# y_ = df_train['Label']
# clf = LogisticRegression(random_state=0, solver='liblinear').fit(X_, y_)

# print('Train score')
# print(clf.score(X_, y_))

# X_val = df_validation[['word_count', 'char_count', 'tag_count', 'hashtag_count', 'link_count', 'smiley_count', 'mark_count', 'has_emoji', 'neg_word_count', 'pos_word_count',
#                '-1_topic', '0_topic', '1_topic', '2_topic', '3_topic', '4_topic', '5_topic', '6_topic', '7_topic']]
# y_val = df_validation['Label']

# print('Validation score')
# print(clf.score(X_val, y_val))

# X_val = df_test[['word_count', 'char_count', 'tag_count', 'hashtag_count', 'link_count', 'smiley_count', 'mark_count', 'has_emoji', 'neg_word_count', 'pos_word_count',
#                '-1_topic', '0_topic', '1_topic', '2_topic', '3_topic', '4_topic', '5_topic', '6_topic', '7_topic']]
# y_val = df_test['Label']
# print('Test score')
# print(clf.score(X_val, y_val))

In [791]:
## Pos and neg words within 4 words
df_train[['clean_text', 'pos_word', 'pos_word_count', 'neg_word', 'neg_word_count']].iloc[25]

clean_text        whoever runs yeovil town twitter account shoul...
pos_word                                                     [like]
pos_word_count                                                    1
neg_word                                                     [fire]
neg_word_count                                                    1
Name: 25, dtype: object

In [792]:
def pos_neg_within_n(df, n=4):

    ret_array = []

    for index, row in df.iterrows():
        if row['pos_word_count'] > 0 and row['neg_word_count'] > 0:
            doc = nlp(row['clean_text'])
            lemmas = []
            for token in doc:
                lemmas.append(token.lemma_)

            pos_indexes = np.array([])
            for word in row['pos_word']:
                pos_indexes = np.append(pos_indexes, np.where(np.array(lemmas) == word))
            neg_indexes = np.array([])
            for word in row['neg_word']:
                neg_indexes = np.append(neg_indexes, np.where(np.array(lemmas) == word))

            bool_val = 0
            for idx in pos_indexes:
                if (abs(neg_indexes-idx) < n).any():
                    bool_val = 1
            ret_array.append(deepcopy(bool_val))
        else:
            ret_array.append(0)
    return ret_array

In [793]:
within_5_train = pos_neg_within_n(df_train, n=5)
within_5_val = pos_neg_within_n(df_validation, n=5)

In [794]:
df_train['pos_neg_within_5'] = within_5_train
df_validation['pos_neg_within_5'] = within_5_val

df_train.head()

Unnamed: 0,Tweet index,Label,Tweet text,emoji_count,clean_text,lemmas,word_count,char_count,all_uppercase_count,all_lowercase_count,...,9_topic,10_topic,11_topic,12_topic,13_topic,14_topic,15_topic,16_topic,17_topic,pos_neg_within_5
0,2698,0,"#OPEC #chief #defends #policy, #says #group to...",0,opec chief defends policy says group try ride ...,"[opec, chief, defend, policy, group, try, ride...",12,53,0,3,...,0,0,0,0,0,0,0,0,0,0
1,1185,0,yet you cant provide ANY source that shows it ...,0,yet you cant provide any source that shows,"[provide, source]",8,35,1,8,...,0,0,0,0,0,0,0,0,0,0
2,1268,1,also sick names,0,also sick names,[sick],3,13,0,3,...,0,0,0,0,0,0,0,0,0,0
3,1770,1,Quel domage RT @CFL_News: No Canada: Bills in...,0,quel domage canada bills series which was colo...,"[quel, domage, canada, bills, series, colossal...",14,69,2,11,...,0,0,0,0,0,0,0,0,0,0
4,2590,0,@PaulGoonerW yeah stressed isnt the word could...,1,yeah stressed isnt the word could easily punch...,"[yeah, stressed, word, easily, punch, face, ri...",16,101,0,17,...,0,0,0,0,0,0,0,0,0,1


In [795]:
X_ = df_train[[ 'pos_neg_within_5']]
y_ = df_train['Label']

clf = LogisticRegression().fit(X_, y_)
print('Train score')
print(clf.score(X_, y_))

X_val = df_validation[[ 'pos_neg_within_5']]
y_val = df_validation['Label']

print('Validation score')
print(clf.score(X_val, y_val))

Train score
0.5183383233532934
Validation score
0.5152838427947598


## Baseline classifiers

In [797]:
baseline_features = ['word_count', 'char_count', 'all_uppercase_count', 'all_lowercase_count', 'capitalised_count', 'digit_count']

y_train = df_train['Label']
x_train = df_train[baseline_features]

y_validation = df_validation['Label']
x_validation = df_validation[baseline_features]

y_test = df_test['Label']
x_test = df_test[baseline_features]

In [798]:
x_train.head()

Unnamed: 0,word_count,char_count,all_uppercase_count,all_lowercase_count,capitalised_count,digit_count
0,12,53,0,3,0,4
1,8,35,1,8,1,9
2,3,13,0,3,0,0
3,14,69,2,11,4,2
4,16,101,0,17,0,0


In [799]:
def model_fit_evaluate(model, x_train, y_train, x_validation, y_validation, x_test, y_test):
    model.fit(x_train, y_train)
    y_train_pred = model.predict(x_train)
    print('-------------------------- TRAIN --------------------------')
    print(classification_report(y_train, y_train_pred, digits=3), 2*'\n')
    
    y_validation_pred = model.predict(x_validation)
    print('----------------------- VALIDATION ------------------------')
    print(classification_report(y_validation, y_validation_pred, digits=3), 2*'\n')
    
    y_test_pred = model.predict(x_test)
    print('------------------------- TEST ---------------------------')
    print(classification_report(y_test, y_test_pred, digits=3), 2*'\n')

## Global baseline classifier

In [802]:
model = LogisticRegression()
model_fit_evaluate(model, x_train, y_train, x_validation, y_validation, x_test, y_test)

-------------------------- TRAIN --------------------------
              precision    recall  f1-score   support

           0      0.626     0.451     0.524      1337
           1      0.570     0.730     0.640      1335

    accuracy                          0.590      2672
   macro avg      0.598     0.590     0.582      2672
weighted avg      0.598     0.590     0.582      2672
 


----------------------- VALIDATION ------------------------
              precision    recall  f1-score   support

           0      0.636     0.437     0.518       579
           1      0.564     0.744     0.641       566

    accuracy                          0.589      1145
   macro avg      0.600     0.590     0.580      1145
weighted avg      0.600     0.589     0.579      1145
 


------------------------- TEST ---------------------------
              precision    recall  f1-score   support

           0      0.649     0.383     0.481       473
           1      0.422     0.685     0.522       31

## Global best classifier

In [813]:
pipeline_lr = Pipeline([
    ('selection', SelectKBest()), 
    ('scaler', StandardScaler()),
    ('pf', PolynomialFeatures()), 
    ('lr', LogisticRegression())
])

params = {
    'selection__k': list(range(2, 20, 3)),
    'pf__degree': [2, 3, 4, 5],
    'lr__penalty': ['l1', 'l2', 'elasticnet'],
    'lr__class_weight': ['balanced', None],
    'lr__C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 2, 5],
    'lr__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 
}


search = GridSearchCV(pipeline_lr, param_grid=params, cv=5)
search.fit(x_train, y_train)

print(f'Best score: {search.best_score_}')
print(f'Best score: {search.best_params_}')

KeyboardInterrupt: 

In [None]:
pipeline_lr_best = Pipeline([
    ('selection', SelectKBest(k=5)), 
    ('scaler', StandardScaler()),
    ('pf', PolynomialFeatures(degree=3)),
    ('lr', LogisticRegression(C=0.1, penalty='l1', solver='liblinear'))
])

model_fit_evaluate(pipeline_lr_best, x_train, y_train, x_validation, y_validation, x_test, y_test)

In [809]:
pipeline_lr = Pipeline([
    #('selection', SelectKBest()), 
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression())
])

params = {
    #'selection__k': list(range(2, 20, 3)),
    'lr__penalty': ['l1'], 
    'lr__C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 2, 5],
    'lr__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 
}


search = GridSearchCV(pipeline_lr, param_grid=params, cv=5)
search.fit(x_train, y_train)

print(f'Best score: {search.best_score_}')
print(f'Best score: {search.best_params_}')

Best score: 0.5916797927823865
Best score: {'lr__C': 0.1, 'lr__penalty': 'l1', 'lr__solver': 'saga'}


In [None]:
pipeline_lr_best = Pipeline([
    ('selection', SelectKBest(k=8)), 
    ('scaler', StandardScaler()),
    ('pf', PolynomialFeatures(degree=3)),
    ('lr', LogisticRegression(C=0.1, penalty='l1', solver='liblinear'))
])

model_fit_evaluate(pipeline_lr_best, x_train, y_train, x_validation, y_validation, x_test, y_test)

## Baseline classifiers for the first 4 topics

In [803]:
def get_train_validation_and_test_for_topic(topic_num, df_train, df_validation, df_test):
    df_train_topic = df_train[df_train[f'{topic_num}_topic'] == 1]
    df_validation_topic = df_train[df_train[f'{topic_num}_topic'] == 1]
    df_test_topic = df_train[df_train[f'{topic_num}_topic'] == 1]
    
    y_train = df_train_topic['Label']
    x_train = df_train_topic[baseline_features]

    y_validation = df_validation_topic['Label']
    x_validation = df_validation_topic[baseline_features]

    y_test = df_test_topic['Label']
    x_test = df_test_topic[baseline_features]

    return x_train, y_train, x_validation, y_validation, x_test, y_test

In [804]:
x_train_, y_train_, x_validation_, y_validation_, x_test_, y_test_ = \
    get_train_validation_and_test_for_topic(-1, df_train, df_validation, df_test)

model = LogisticRegression()
model_fit_evaluate(model, x_train_, y_train_, x_validation_, y_validation_, x_test_, y_test_)

-------------------------- TRAIN --------------------------
              precision    recall  f1-score   support

           0      0.617     0.501     0.553       599
           1      0.564     0.675     0.615       573

    accuracy                          0.586      1172
   macro avg      0.591     0.588     0.584      1172
weighted avg      0.591     0.586     0.583      1172
 


----------------------- VALIDATION ------------------------
              precision    recall  f1-score   support

           0      0.617     0.501     0.553       599
           1      0.564     0.675     0.615       573

    accuracy                          0.586      1172
   macro avg      0.591     0.588     0.584      1172
weighted avg      0.591     0.586     0.583      1172
 


------------------------- TEST ---------------------------
              precision    recall  f1-score   support

           0      0.617     0.501     0.553       599
           1      0.564     0.675     0.615       57

In [805]:
x_train_, y_train_, x_validation_, y_validation_, x_test_, y_test_ = \
    get_train_validation_and_test_for_topic(0, df_train, df_validation, df_test)

model = LogisticRegression()
model_fit_evaluate(model, x_train_, y_train_, x_validation_, y_validation_, x_test_, y_test_)

-------------------------- TRAIN --------------------------
              precision    recall  f1-score   support

           0      0.632     0.542     0.583       168
           1      0.613     0.697     0.652       175

    accuracy                          0.621       343
   macro avg      0.623     0.619     0.618       343
weighted avg      0.622     0.621     0.619       343
 


----------------------- VALIDATION ------------------------
              precision    recall  f1-score   support

           0      0.632     0.542     0.583       168
           1      0.613     0.697     0.652       175

    accuracy                          0.621       343
   macro avg      0.623     0.619     0.618       343
weighted avg      0.622     0.621     0.619       343
 


------------------------- TEST ---------------------------
              precision    recall  f1-score   support

           0      0.632     0.542     0.583       168
           1      0.613     0.697     0.652       17

In [806]:
x_train_, y_train_, x_validation_, y_validation_, x_test_, y_test_ = \
    get_train_validation_and_test_for_topic(1, df_train, df_validation, df_test)

model = LogisticRegression()
model_fit_evaluate(model, x_train_, y_train_, x_validation_, y_validation_, x_test_, y_test_)

-------------------------- TRAIN --------------------------
              precision    recall  f1-score   support

           0      0.606     0.679     0.640        84
           1      0.571     0.493     0.529        73

    accuracy                          0.592       157
   macro avg      0.589     0.586     0.585       157
weighted avg      0.590     0.592     0.589       157
 


----------------------- VALIDATION ------------------------
              precision    recall  f1-score   support

           0      0.606     0.679     0.640        84
           1      0.571     0.493     0.529        73

    accuracy                          0.592       157
   macro avg      0.589     0.586     0.585       157
weighted avg      0.590     0.592     0.589       157
 


------------------------- TEST ---------------------------
              precision    recall  f1-score   support

           0      0.606     0.679     0.640        84
           1      0.571     0.493     0.529        7

In [807]:
x_train_, y_train_, x_validation_, y_validation_, x_test_, y_test_ = \
    get_train_validation_and_test_for_topic(2, df_train, df_validation, df_test)

model = LogisticRegression()
model_fit_evaluate(model, x_train_, y_train_, x_validation_, y_validation_, x_test_, y_test_)

-------------------------- TRAIN --------------------------
              precision    recall  f1-score   support

           0      0.657     0.878     0.751        74
           1      0.591     0.277     0.377        47

    accuracy                          0.645       121
   macro avg      0.624     0.577     0.564       121
weighted avg      0.631     0.645     0.606       121
 


----------------------- VALIDATION ------------------------
              precision    recall  f1-score   support

           0      0.657     0.878     0.751        74
           1      0.591     0.277     0.377        47

    accuracy                          0.645       121
   macro avg      0.624     0.577     0.564       121
weighted avg      0.631     0.645     0.606       121
 


------------------------- TEST ---------------------------
              precision    recall  f1-score   support

           0      0.657     0.878     0.751        74
           1      0.591     0.277     0.377        4

## Grid search

In [None]:
weights = pipeline_lr_best['lr'].coef_.flatten()

plt.figure(figsize=(15, 15))
plt.bar(list(range(len(weights))), weights)
plt.ylabel('Value of the weight')
plt.xlabel('Index of the weight')
plt.title('Feature importance plot for logistic regression')
plt.show()

In [None]:
pipeline_dtc = Pipeline([
    ('selection', SelectKBest()), 
    ('dtc', DecisionTreeClassifier()),
])

params = {
    'selection__k': list(range(2, 20, 3)),
    'dtc__max_features': ['auto', 'sqrt', 'log2'],
    'dtc__ccp_alpha': [0.1, .01, .001],
    'dtc__max_depth' : list(range(2, 10)),
    'dtc__criterion' : ['gini', 'entropy', 'log_loss']
}

search = GridSearchCV(pipeline_dtc, param_grid=params, cv=5)
search.fit(x_train, y_train)

print(f'Best score: {search.best_score_}')
print(f'Best score: {search.best_params_}')

In [None]:
pipeline_dtc_best = Pipeline([
    ('selection', SelectKBest(k=5)), 
    ('dtc', DecisionTreeClassifier(ccp_alpha=0.001, criterion='entropy', max_depth=5, max_features='auto')),
])

model_fit_evaluate(pipeline_dtc_best, x_train, y_train, x_validation, y_validation, x_test, y_test)

In [None]:
feat_importances = pipeline_dtc_best['dtc'].feature_importances_.flatten()

plt.figure(figsize=(5, 5))
plt.bar(list(range(len(feat_importances))), feat_importances)
plt.ylabel('Feature importance value')
plt.xlabel('Index of the feature')
plt.title('Feature importance plot for decision tree')
plt.show()