<a href="https://colab.research.google.com/github/umaojha/sarcasm/blob/main/Working_sarcasm_fastext.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sarcasm Detection on Swami Dataset using TF-IDF and FastText Embeddings

This notebook demonstrates classical machine learning and embedding-based
experiments on the Swami sarcasm dataset. It is meant to provide clarity and
reproducibility for the preprocessing and classification pipeline.

The notebook covers:
1. Loading the Swami dataset  
2. Preprocessing (cleaning + normalization)  
3. TF-IDF feature extraction  
4. Traditional machine learning classifiers  
5. fastText embedding training  
6. fastText vector averaging + ML classifier  
7. Evaluation on accuracy, F1-score, confusion matrix  


In [None]:
import pandas as pd

# Read the code-mixed Hinglish sarcasm CSV straight from GitHub and preview it.
data = pd.read_csv(
    "https://raw.githubusercontent.com/rajnish8807riday/Sarcasm_multilingual/main/cm_hinglish_f.csv"
)
data.head(3)

Unnamed: 0.1,Unnamed: 0,text,label
0,0,Triple Talaq par Burbak Kuchh nahi bolega,0
1,1,Batao ye uss site pr se akki sir ke verdict ni...,1
2,2,Hindu baheno par julam bardas nahi hoga @Tripl...,0


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Cleaning Data Tools
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
stopwords = stopwords.words('english')

# Sentiment Analysis
!pip install neattext
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import neattext.functions as nfx
from textblob import TextBlob
#import emoji

from gensim.models.phrases import Phrases, Phraser

# Word Embedding
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer #for TF-IDF
from sklearn.feature_extraction.text import CountVectorizer  #For Bag of words
from gensim.models import Word2Vec  #For Word2Vec
from gensim.models import FastText  #For Fast Text

# Scaling and Evaluation Methods
from sklearn import preprocessing
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# ML Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Collecting neattext
  Downloading neattext-0.1.3-py3-none-any.whl.metadata (12 kB)
Downloading neattext-0.1.3-py3-none-any.whl (114 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.7/114.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neattext
Successfully installed neattext-0.1.3
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


**Preprocessing step**

In [None]:
# Collapse "stretched" letters.
#
#   input           output
#   aaaa            ''       (string made of a single repeated character)
#   wwwwwwww        ''
#   helloooooooo    hello    (runs of 3+ collapse to one letter, doubles are kept)
#   call            call     (a double letter at the very end is kept as well)

def clean(string):
    """Collapse runs of three or more identical characters to a single one.

    Runs of exactly two identical characters are kept as they are, so
    ordinary doubled letters ("hello", "call") survive.

    Args:
        string: Text to normalise. The whole string is treated as one unit.

    Returns:
        The normalised text. An empty string is returned when the input is
        empty or consists of one character repeated (e.g. "aaaa").
    """
    if len(string) == 0:
        return ''
    if set(string) == set(string[0]):
        return ''

    kept = []
    prev = None
    run_length = 0
    for ch in string:
        if ch == prev:
            run_length += 1
            continue
        # A new run starts: the run that just ended was a genuine double
        # letter only if it had exactly two characters, so emit its second copy.
        if run_length == 2:
            kept.append(prev)
        kept.append(ch)
        prev = ch
        run_length = 1

    # Flush a double letter that ends the string (previously dropped,
    # which turned "call" into "cal").
    if run_length == 2:
        kept.append(prev)
    return ''.join(kept)


In [None]:
# Tweet cleaning pipeline: every step reads and overwrites data['clean_tweet'].

# Hashtag removal is disabled, so hashtag words are kept; only the '#' itself
# is stripped by the custom pattern below.
#data['clean_tweet'] = data['text'].apply(nfx.remove_hashtags)

# Cleaning Text: userhandles
data['clean_tweet'] = data['text'].apply(nfx.remove_userhandles)

# Cleaning Text : Remove urls
data['clean_tweet'] = data['clean_tweet'].apply(nfx.remove_urls)

# Cleaning Text : custom remove special characters ('#',':', ',', ';', '.', '|','-','_','^', [&amp, &yen, ....])
# Raw string: the pattern value is unchanged, but the backslashes no longer
# form invalid string escapes (a warning on current Python versions).
SPECIAL_CHARS_PATTERN = r':+|\,+|\;+|\.+|"+|\|+|\-+|\_+|\#+|\%+|\^|\*|\&[a-zA-Z]*'
data['clean_tweet'] = data['clean_tweet'].apply(lambda x: nfx.remove_custom_pattern(x, SPECIAL_CHARS_PATTERN))
data['clean_tweet'] = data['clean_tweet'].apply(lambda x: nfx.remove_custom_words(x,'\n'))

# Cleaning Text: Punctuations
data['clean_tweet'] = data['clean_tweet'].apply(nfx.remove_puncts)
data['clean_tweet'] = data['clean_tweet'].apply(nfx.remove_punctuations)

# Cleaning Text: dates
data['clean_tweet'] = data['clean_tweet'].apply(nfx.remove_dates)

# Cleaning Text: Emails
data['clean_tweet'] = data['clean_tweet'].apply(nfx.remove_emails)

# Cleaning Text: Numbers
data['clean_tweet'] = data['clean_tweet'].apply(nfx.remove_numbers)

data['clean_tweet'] = data['clean_tweet'].apply(nfx.remove_special_characters)

# Collapse stretched letters; clean() is applied to the whole tweet string.
data['clean_tweet'] = data['clean_tweet'].fillna('').map(clean)

# Cleaning Text: Multiple WhiteSpaces
#data['clean_tweet'] = data['clean_tweet'].apply(nfx.remove_multiple_spaces)

# Before / after spot check on one tweet.
print(data.text[58])
print("=====")
print(data.clean_tweet[58])

Agar yahi chlta rha...to #BOLLYWOOD Ka future khatre mein hai
=====
Agar yahi chlta rha to BOLLYWOOD Ka future khatre mein hai


Finding sentiments using the VADER analyser

In [None]:
# Single VADER analyser instance, reused by get_sentiment() for every tweet.
vader_obj = SentimentIntensityAnalyzer()


In [None]:
def get_sentiment(tweet, subjectivity_threshold=0.25, compound_threshold=0.05):
    """Label a tweet as 'Positive', 'Negative', 'Neutral' or 'Objective'.

    Emojis are first replaced by their text names (underscores turned into
    spaces). TextBlob supplies the subjectivity score and the module-level
    VADER analyser ``vader_obj`` supplies the compound polarity score.

    Note: the ``emoji`` module must be imported before this function is
    called; it is looked up at call time.

    Args:
        tweet: Text of the tweet.
        subjectivity_threshold: Tweets whose TextBlob subjectivity is below
            this value are labelled 'Objective' regardless of polarity.
        compound_threshold: Compound scores at or above this value are
            'Positive', at or below its negative are 'Negative'.

    Returns:
        One of 'Positive', 'Negative', 'Neutral', 'Objective'.
    """
    text = emoji.demojize(tweet, delimiters=("", "")).replace("_", " ")

    subjectivity = TextBlob(text).sentiment.subjectivity
    if subjectivity < subjectivity_threshold:
        return 'Objective'

    compound = vader_obj.polarity_scores(text)['compound']
    if compound >= compound_threshold:
        return 'Positive'
    if compound <= -compound_threshold:
        return 'Negative'
    return 'Neutral'


In [None]:
# Pick one cleaned tweet (row label 4400) as a sample input for get_sentiment().
ex1 = data['clean_tweet'][4400]
ex1

'Alka ji ise kehte hai politics bhi aur palti bhi'

In [None]:
!pip install emoji
# get_sentiment() calls emoji.demojize, so `emoji` has to be imported before
# the first call (the import in the top import cell is commented out).
import emoji
# Try the labeller on the sample tweet.
get_sentiment(ex1)


Collecting emoji
  Downloading emoji-2.12.1-py3-none-any.whl.metadata (5.4 kB)
Downloading emoji-2.12.1-py3-none-any.whl (431 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.12.1


'Objective'

In [None]:
%%time
# Label every cleaned tweet with get_sentiment() and store the result in a new column.
data['sentiment'] = data['clean_tweet'].apply(get_sentiment)

CPU times: user 3.4 s, sys: 15.1 ms, total: 3.42 s
Wall time: 4.42 s


In [None]:
# Inspect the frame after cleaning and sentiment tagging (first 67 rows requested).
data.head(67)

Unnamed: 0.1,Unnamed: 0,text,label,clean_tweet,sentiment
0,0,Triple Talaq par Burbak Kuchh nahi bolega,0,Triple Talaq par Burbak Kuchh nahi bolega,Objective
1,1,Batao ye uss site pr se akki sir ke verdict ni...,1,Batao ye uss site pr se akki sir ke verdict ni...,Objective
2,2,Hindu baheno par julam bardas nahi hoga @Tripl...,0,Hindu baheno par julam bardas nahi hoga Hindu ...,Objective
3,3,Naa bhai.. aisa nhi hai.. mere handle karne se...,0,Naa bhai aisa nhi hai mere handle karne se bhi...,Neutral
4,4,#RememberingRajiv aaj agar musalman auraten tr...,0,RememberingRajiv aaj agar musalman auraten tri...,Objective
...,...,...,...,...,...
62,62,Rastra Ke Liye Unhone Jo Kiya Wo Koi Bollywood...,0,Rastra Ke Liye Unhone Jo Kiya Wo Koi Bollywood...,Objective
63,63,Bhutto ko maray 50 saal hogaye lakin ajj bhi b...,0,Bhutto ko maray saal hogaye lakin ajj bhi bhu...,Objective
64,64,Maiden shabash bas ab aisay he khelo -.- #Sarcasm,1,Maiden shabash bas ab aisay he khelo Sarcasm,Objective
65,65,#Reliance Arey Aap ki company ki advertisement...,1,Reliance Arey Aap ki company ki advertisement ...,Positive


In [None]:
# Show the derived sentiment tag next to the sarcasm label for each row.

data[['sentiment', 'label']] # selecting the sentiment and label columns


Unnamed: 0,sentiment,label
0,Objective,0
1,Objective,1
2,Objective,0
3,Neutral,0
4,Objective,0
...,...,...
5245,Objective,0
5246,Objective,0
5247,Objective,0
5248,Objective,0


In [None]:
# NOTE(review): "sarcasm.csv" and "hindi-all.csv" are read from the working
# directory and are not written by any cell visible above -- confirm where
# they come from before relying on Restart & Run All.
sarcasm = pd.read_csv("sarcasm.csv")
hindi = pd.read_csv("hindi-all.csv")

# Attach the Hindi column to the sarcasm frame.
sarcasm['hindi'] = hindi['Hindi']

# From here on `data` is the frame that also carries the English / Hindi text.
data = sarcasm
for column in ('clean_tweet', 'English'):
    data[column] = data[column].str.lower()
data.head()


Unnamed: 0.1,Unnamed: 0,text,label,clean_tweet,English,hindi
0,0,Triple Talaq par Burbak Kuchh nahi bolega,0,triple talaq par burbak kuchh nahi bolega,burbak kuchh will not say anything on triple t...,तीन तलाक पर कुछ नहीं बोलेंगे बुरबक कुच्छ
1,1,Batao ye uss site pr se akki sir ke verdict ni...,1,batao ye uss site pr se akki sir ke verdict ni...,tell us from this site that akki sir's verdict...,इस साइट से हमें बताएं कि अक्की सर का फैसला वही...
2,2,Hindu baheno par julam bardas nahi hoga @Tripl...,0,hindu baheno par julam bardas nahi hoga hindu ...,"there will be no oppression on hindu sisters, ...","""हिन्दू बहनों पर कोई जुल्म नहीं होगा, हमारी हि..."
3,3,Naa bhai.. aisa nhi hai.. mere handle karne se...,0,naa bhai aisa nhi hai mere handle karne se bhi...,"naa brother, it is not like that, i have nothi...","""ना भाई, ऐसा नहीं है, मेरा राजनीति से कोई लेना..."
4,4,#RememberingRajiv aaj agar musalman auraten tr...,0,rememberingrajiv aaj agar musalman auraten tri...,"remembering rajiv, today if muslim women are f...","''राजीव को याद करते हुए, आज अगर मुस्लिम महिलाए..."


In [None]:
# Human-readable class name derived from the binary label (1 -> 'sarcasm').
data['sentiment'] = data['label'].map({1:'sarcasm',  0:'No_sarcasm'})
data.to_csv("sarcasm_all.csv")

# Take an explicit copy of the modelling columns: adding a column to a plain
# column selection of `data` raises pandas' SettingWithCopyWarning.
final_data = data[['clean_tweet','sentiment','label','English','hindi']].copy()

#adding polarity score
final_data['polarity'] = final_data['clean_tweet'].apply(get_sentiment)
final_data.head(3)
#final_data.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['polarity']= data['clean_tweet'].apply(get_sentiment)


Unnamed: 0,clean_tweet,sentiment,label,English,hindi,polarity
0,triple talaq par burbak kuchh nahi bolega,No_sarcasm,0,burbak kuchh will not say anything on triple t...,तीन तलाक पर कुछ नहीं बोलेंगे बुरबक कुच्छ,Objective
1,batao ye uss site pr se akki sir ke verdict ni...,sarcasm,1,tell us from this site that akki sir's verdict...,इस साइट से हमें बताएं कि अक्की सर का फैसला वही...,Objective
2,hindu baheno par julam bardas nahi hoga hindu ...,No_sarcasm,0,"there will be no oppression on hindu sisters, ...","""हिन्दू बहनों पर कोई जुल्म नहीं होगा, हमारी हि...",Objective


In [None]:
# Stop-word removal, lower-casing, lemmatization and tokenization.
# (Stemming was dropped because it truncated words, e.g. "triple" -> "tripl".)

lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english')  # not used by preprocess_text
tags = "[^A-Za-z]+"

days=['monday','tuesday','wednesday','thursday','friday','saturday','sunday']
months=['january','february','march', 'april','may','june','july','august','september','october','november','december']

# Built once so that preprocess_text does a set lookup per word instead of
# concatenating three lists for every word.
_EXCLUDED_WORDS = frozenset(stopwords) | frozenset(days) | frozenset(months)


def preprocess_text(sentence, stem = True):
    """Lower-case, strip non-letters, filter and (optionally) lemmatize a text.

    Everything that is not an ASCII letter is replaced by a space. Words
    shorter than three characters, English stop words, and day / month names
    are removed.

    Args:
        sentence: Text to process; it is converted with ``str()`` first.
        stem: When True (default) each kept word is passed through the
            WordNet lemmatizer. Despite the name, no stemming is performed.

    Returns:
        The remaining words joined by single spaces.
    """
    sentence = re.sub(tags, ' ', str(sentence).lower())
    kept = []
    for word in sentence.split():
        if len(word) < 3 or word in _EXCLUDED_WORDS:
            continue
        kept.append(lemmatizer.lemmatize(word) if stem else word)
    return " ".join(kept)


print(f"Orignal Text : {final_data.clean_tweet[7]}")
print("\nAfter Preprocessed : \n")
print(f"Preprocessed Text : {preprocess_text(final_data.clean_tweet[7])}")


Orignal Text : bhai triple talaq se aap kya samjhte hai samjhaye aap zara agar triple talaq pta hota apko toh aisa nhi kehte

After Preprocessed : 

Preprocessed Text : bhai triple talaq aap kya samjhte hai samjhaye aap zara agar triple talaq pta hota apko toh aisa nhi kehte


In [None]:
%%time
# assign() returns a new DataFrame, so nothing is written into a slice of
# `data` (the attribute assignment used before raised SettingWithCopyWarning).
final_data = final_data.assign(clean_tweet=final_data['clean_tweet'].map(preprocess_text))
final_data.head(3)

CPU times: user 1.37 s, sys: 5.87 ms, total: 1.38 s
Wall time: 3.77 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,clean_tweet,sentiment,label,English,hindi,polarity
0,triple talaq par burbak kuchh nahi bolega,No_sarcasm,0,burbak kuchh will not say anything on triple t...,तीन तलाक पर कुछ नहीं बोलेंगे बुरबक कुच्छ,Objective
1,batao us site akki sir verdict nikaal laaye ja...,sarcasm,1,tell us from this site that akki sir's verdict...,इस साइट से हमें बताएं कि अक्की सर का फैसला वही...,Objective
2,hindu baheno par julam bardas nahi hoga hindu ...,No_sarcasm,0,"there will be no oppression on hindu sisters, ...","""हिन्दू बहनों पर कोई जुल्म नहीं होगा, हमारी हि...",Objective


In [None]:
# Count missing values per column.
final_data.isnull().sum()


Unnamed: 0,0
clean_tweet,0
sentiment,0
label,0
English,0
hindi,0
polarity,0


In [None]:
# Use multiple classifiers and grid search for prediction
from xgboost import XGBClassifier


def ML_modeling(models, params, X_train, X_test, y_train, y_test):
    """Grid-search every model on the training data and print test metrics.

    For each entry of ``models`` a 5-fold ``GridSearchCV`` is fitted over the
    matching grid in ``params``; the refitted best estimator then predicts
    ``X_test``. The best parameters plus accuracy and macro-averaged
    precision / recall / F1 are printed.

    Args:
        models: Mapping of display name -> unfitted estimator.
        params: Mapping of display name -> parameter grid. Must contain
            every key of ``models``.
        X_train, X_test: Feature matrices.
        y_train, y_test: Class labels (strings or integers).

    Raises:
        ValueError: If a model has no entry in ``params``.
    """
    if not set(models.keys()).issubset(set(params.keys())):
        raise ValueError('Some estimators are missing parameters')

    # Encode the labels as integers 0..k-1. The scikit-learn models accept
    # string labels, but XGBClassifier (in recent xgboost releases) expects
    # integer classes. The printed metrics do not depend on the encoding.
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(list(y_train) + list(y_test))
    y_train_enc = label_encoder.transform(y_train)
    y_test_enc = label_encoder.transform(y_test)

    for key, model in models.items():
        gs = GridSearchCV(model, params[key], cv=5, error_score=0, refit=True)
        gs.fit(X_train, y_train_enc)
        y_pred = gs.predict(X_test)

        # Print scores for the classifier
        print(key, ':', gs.best_params_)
        print("Accuracy: %1.3f \tPrecision: %1.3f \tRecall: %1.3f \t\tF1: %1.3f\n" % (accuracy_score(y_test_enc, y_pred), precision_score(y_test_enc, y_pred, average='macro'), recall_score(y_test_enc, y_pred, average='macro'), f1_score(y_test_enc, y_pred, average='macro')))

    return


## Models and their parameter grids; the keys of both dicts must match.
models = {
    'SVM': SVC(),
    #'Random Forest Classifier': RandomForestClassifier(),
    'Naive Bayes': MultinomialNB(),
    'logistic regression' : LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XG boost': XGBClassifier()
}

params = {
    'SVM': { 'kernel': ['linear', 'rbf'] },
    #'Random Forest Classifier': {'criterion': ['gini', 'entropy']},
    'Naive Bayes': { 'alpha': [0.5, 1], 'fit_prior': [True, False] },
    'logistic regression' : {'max_iter':[2000]},
    'Decision Tree': { 'min_samples_split': [2, 5, 7] },
    'Gradient Boosting': { 'learning_rate': [0.05, 0.1] },
    'XG boost':{'n_estimators': list(range(60, 220, 40)),'learning_rate': [0.1, 0.01, 0.05]}
}

In [None]:
# Split the data into train and test sets first, then fit TF-IDF on the
# training tweets only. Fitting the vectorizer on all rows before splitting
# lets test-set vocabulary and document frequencies leak into the features.
final_data = final_data.reset_index(drop=True)
final_data.info()

# Code-mixed (Hinglish) text
tweets_train, tweets_test, y_train, y_test = train_test_split(
    final_data['clean_tweet'], final_data.sentiment, random_state=42, test_size=0.2)

tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
X_train = tfidf.fit_transform(tweets_train)
X_test = tfidf.transform(tweets_test)

# To run the same experiment on the Hindi translations, split
# final_data['hindi'] instead and build the vectorizer without English stop
# words: TfidfVectorizer(ngram_range=(1,2)).

print(X_train.shape, X_test.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5250 entries, 0 to 5249
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   clean_tweet  5250 non-null   object
 1   sentiment    5250 non-null   object
 2   label        5250 non-null   int64 
 3   English      5250 non-null   object
 4   hindi        5250 non-null   object
 5   polarity     5250 non-null   object
dtypes: int64(1), object(5)
memory usage: 246.2+ KB
(4200, 31913) (1050, 31913)


# **For Hindi Text**

In [None]:
#using hindi text using tf-idf
#tfidf=TfidfVectorizer(ngram_range=(1,2))
#hindi_tweet1=tfidf.fit_transform(final_data['hindi'])

#Extra line addes-----------------------
#polarity=final_data['polarity']
#polarity=tfidf.fit_transform(final_data['polarity'])
#polarity.size
#X = ['hindi_tweet', 'polarity']
#Y=final_data.sentiment
#X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42, test_size=0.2)

# till here..................
#this  line comment has to be removed



#X_train, X_test, y_train, y_test = train_test_split(hindi_tweet1, final_data.sentiment, random_state=42, test_size=0.2)
#using hindi text using tf-idf
#X_train

In [None]:
%%time
# Grid-search and evaluate every model in `models` on the TF-IDF features.
ML_modeling(models, params, X_train, X_test, y_train, y_test)


SVM : {'kernel': 'linear'}
Accuracy: 0.898 	Precision: 0.815 	Recall: 0.594 		F1: 0.627

Naive Bayes : {'alpha': 0.5, 'fit_prior': True}
Accuracy: 0.881 	Precision: 0.542 	Recall: 0.502 		F1: 0.476

logistic regression : {'max_iter': 2000}
Accuracy: 0.886 	Precision: 0.727 	Recall: 0.533 		F1: 0.535

Decision Tree : {'min_samples_split': 2}
Accuracy: 0.897 	Precision: 0.754 	Recall: 0.682 		F1: 0.709

Gradient Boosting : {'learning_rate': 0.05}
Accuracy: 0.904 	Precision: 0.803 	Recall: 0.647 		F1: 0.688

CPU times: user 4min 43s, sys: 2.38 s, total: 4min 46s
Wall time: 4min 54s


# Fast-Text Embedding

**Bigrams**

We use the Gensim `Phrases` class to automatically detect common phrases (bigrams) from a list of sentences.


`Phrases()` takes a list of lists of words as input, so each document is first split into words:



In [None]:
# Tokenise each document on whitespace into a list of words.
# Code-mixed (Hinglish) corpus -- uncomment to use it instead:
#sent = [row.split() for row in final_data['clean_tweet']]
# Hindi-translation corpus (currently active).
# NOTE(review): the saved most_similar("cricket") output further down lists
# romanised words, so it appears to come from a code-mixed run -- confirm
# which corpus is intended here.
sent = [row.split() for row in final_data['hindi']]

Creates the relevant phrases from the list of sentences:




In [None]:
# Collect bigram statistics over the tokenised corpus
# (min_count=30 is the frequency cut-off; see the gensim Phrases documentation).
phrases = Phrases(sent, min_count=30, progress_per=10000)


The goal of Phraser() is to cut down memory consumption of Phrases(), by discarding model state not strictly needed for the bigram detection task:



In [None]:
# Wrap the collected statistics in a Phraser, which is used below to transform the corpus.
bigram = Phraser(phrases)


Transform the corpus based on the bigrams detected:



In [None]:
# Apply the bigram model to the tokenised corpus.
sentences = bigram[sent]


In [None]:
# Pair each transformed sentence with its class name so both can be split together later.
sentences_df = pd.DataFrame({"sentences":sentences,"sentiment":final_data.sentiment})


**Gensim FastText implementation**

In [None]:
# Only configure the model here. When a corpus is passed to the constructor,
# gensim builds the vocabulary and trains immediately; the following cells
# call build_vocab() and train() explicitly, so passing `sentences` here made
# the model train twice.
fasttext_model = FastText(window=5,
                          vector_size=64,
                          min_count=5,
                          sample=6e-5,
                          alpha=0.03,
                          min_alpha=0.0007,
                          epochs=100
                         )

In [None]:
# Build the model vocabulary from the bigram-transformed corpus.
fasttext_model.build_vocab(sentences, progress_per=20000)




Training of the model:



In [None]:
# Train for 100 epochs on the same corpus; total_examples reuses the corpus count stored on the model.
fasttext_model.train(sentences, total_examples=fasttext_model.corpus_count,epochs=100, report_delay=1)





(2695628, 9349300)

In [None]:
# Sanity check of the embeddings: nearest neighbours of the word "cricket".
fasttext_model.wv.most_similar("cricket")


[('team', 0.9716107249259949),
 ('aaj', 0.9607118964195251),
 ('dekhne', 0.950282871723175),
 ('tujhe', 0.9383115768432617),
 ('ni', 0.9355538487434387),
 ('sakta', 0.9351341724395752),
 ('aur', 0.934432864189148),
 ('bhaiya', 0.9313564300537109),
 ('khel', 0.9305294752120972),
 ('toh', 0.9303534030914307)]

In [None]:
# Fixed random_state so the FastText results are reproducible across runs
# (same seed as the TF-IDF split).
X_train, X_test, y_train, y_test = train_test_split(sentences_df.sentences, sentences_df.sentiment, test_size=0.2, shuffle=True, random_state=42)

In [None]:
# Word -> vector lookup built from the model's vocabulary list and its vector matrix.
# Words missing from this dict are skipped by the Vectorizer defined below.
ftv = dict(zip(fasttext_model.wv.index_to_key, fasttext_model.wv.vectors))


In [None]:
class Vectorizer(object):
    """Turn token lists into fixed-size vectors by averaging word vectors.

    Args:
        vec: Mapping of word -> vector (for example the ``ftv`` dict).
        dim: Vector length used for documents without any known word.
            When omitted it is taken from the first vector in ``vec``;
            64 is used if ``vec`` is empty or has no ``values()`` method.
    """

    def __init__(self, vec, dim=None):
        self.vec = vec
        if dim is None:
            dim = 64
            values = getattr(vec, "values", None)
            if callable(values):
                # Infer the embedding width instead of hard-coding it, so the
                # class keeps working when the FastText vector_size changes.
                for vector in values():
                    dim = len(vector)
                    break
        self.dim = dim

    def fit(self, X, y=None):
        """Nothing to learn; present so the class can be used in a Pipeline."""
        return self

    def transform(self, X):
        """Return one row per document: the mean of its known word vectors.

        Documents with no word in ``vec`` become an all-zero row.
        """
        zero_vector = np.zeros(self.dim)
        rows = []
        for words in X:
            known = [self.vec[w] for w in words if w in self.vec]
            rows.append(np.mean(known, axis=0) if known else zero_vector)
        return np.array(rows)


## Wrapper that lets a grid-searched estimator act as the last step of a Pipeline.
## fit() runs a 5-fold GridSearchCV over the given parameter grid (refit=True),
## predict() delegates to that fitted grid search.
class Classifier(object):
    """Pair an estimator with its parameter grid inside a GridSearchCV."""

    def __init__(self, model, param):
        self.model, self.param = model, param
        self.gs = GridSearchCV(estimator=model, param_grid=param, cv=5, error_score=0, refit=True)

    def fit(self, X, y):
        """Run the grid search on (X, y) and return the fitted GridSearchCV."""
        fitted_search = self.gs.fit(X, y)
        return fitted_search

    def predict(self, X):
        """Predict labels for X with the fitted grid search."""
        predictions = self.gs.predict(X)
        return predictions


## Estimators evaluated on the averaged FastText vectors, and their parameter grids.
## Both dicts must use the same keys: the evaluation loop looks up clf_params[key]
## for every key of clf_models.
## Background on pipelines: https://www.youtube.com/watch?v=Y4iJfKX_QeQ&t=52s
clf_models = {
    'SVM': SVC(),
    #'Random Forest Classifier': RandomForestClassifier(),
    'Naive Bayes': GaussianNB(),
    'logistic regression' : LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}
clf_params = {
    'SVM': { 'kernel': ['linear', 'rbf'] },
    #'Random Forest Classifier': {'criterion': ['gini', 'entropy'] },
    # Empty grid: the estimator is evaluated with its default parameters only.
    'Naive Bayes': { },
    'logistic regression' : {'max_iter':[2000]},
    'Decision Tree': { 'min_samples_split': [2, 5] },
    'Gradient Boosting': { 'learning_rate': [0.05, 0.1]}
}

In [None]:
%%time
import warnings
warnings.filterwarnings("ignore")

print("==============FastText==============\n")
## Evaluate every classifier on the averaged FastText vectors.
for key, estimator in clf_models.items():

    vectorizer_step = ('FastText vectorizer', Vectorizer(ftv))
    classifier_step = ('Classifier', Classifier(estimator, clf_params[key]))
    clf = Pipeline([vectorizer_step, classifier_step])

    ## fit() runs the grid search (5-fold CV) on the training split only
    clf.fit(X_train, y_train)
    ## predict() goes through the fitted grid search held by Classifier
    y_pred = clf.predict(X_test)

    ## performance metrics for this classifier
    scores = (
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred, average='macro'),
        recall_score(y_test, y_pred, average='macro'),
        f1_score(y_test, y_pred, average='macro'),
    )
    print(key, ':')
    print("Accuracy: %1.3f \tPrecision: %1.3f \tRecall: %1.3f \t\tF1: %1.3f\n" % scores)



SVM :
Accuracy: 0.923 	Precision: 0.832 	Recall: 0.575 		F1: 0.608

Naive Bayes :
Accuracy: 0.845 	Precision: 0.583 	Recall: 0.618 		F1: 0.594

logistic regression :
Accuracy: 0.919 	Precision: 0.746 	Recall: 0.623 		F1: 0.659

Decision Tree :
Accuracy: 0.858 	Precision: 0.586 	Recall: 0.605 		F1: 0.594

Gradient Boosting :
Accuracy: 0.923 	Precision: 0.775 	Recall: 0.626 		F1: 0.665

CPU times: user 2min 15s, sys: 846 ms, total: 2min 16s
Wall time: 2min 17s
