In [2]:
!pip install transformers ekphrasis datasets -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
import re
import string
import nltk
import pandas as pd
import warnings
import mlflow
import contractions
import unidecode

from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient

# from ekphrasis.classes.preprocessor import TextPreProcessor
# from ekphrasis.classes.tokenizer import SocialTokenizer
# from ekphrasis.dicts.emoticons import emoticons

from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

pd.options.display.max_colwidth = None
pd.options.display.max_rows = None
warnings.filterwarnings('ignore')

In [38]:
train = pd.read_csv("../data/raw/train.csv")
test = pd.read_csv("../data/raw/test.csv")

In [39]:
print(train.shape)
print(test.shape)

(7613, 5)
(3263, 4)


In [18]:
TRACKING_SERVER_HOST = "127.0.0.1"
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")

## Preprocess Text

In [5]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


Going over the dataset we find:

- The text contains punctuations, hashtags, numeric figures, mentions, URLs, Dates, Accented Characters, Contractions
- Text also contains characters like `&gt`, `&amp`, `\n`.
- We'll also find duplicated tweets, however there are some discrepancies between the duplicated tweets. For the same tweet we have different labels.

In [40]:
# Duplicated tweets (Expand to see more)
train[train.text.duplicated(keep=False)].sort_values(by='text').head()

Unnamed: 0,id,keyword,location,text,target
4290,6094,hellfire,"Jubail IC, Saudi Arabia.",#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect,0
4299,6105,hellfire,?????? ??? ?????? ????????,#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect,0
4312,6123,hellfire,?????? ???? ??????,#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect,1
6363,9095,suicide%20bomb,Nigeria,#Bestnaijamade: 16yr old PKK suicide bomber who detonated bomb in ... http://t.co/KSAwlYuX02 bestnaijamade bestnaijamade bestnaijamade beÛ_,1
6373,9107,suicide%20bomb,Nigeria,#Bestnaijamade: 16yr old PKK suicide bomber who detonated bomb in ... http://t.co/KSAwlYuX02 bestnaijamade bestnaijamade bestnaijamade beÛ_,1


Let's start by cleaning up the text.

### Clearning up the text

In [41]:
# Drop the duplicates from the dataframe
train = train.drop_duplicates(subset=['text', 'target']).reset_index(drop=True)

# After manually going through the tweets with different target values
# Assign the target values to the tweets to find the duplicates
non_disaster =  [4253, 4182, 3212, 4249, 6535, 1190, 4239, 3936, 1214, 6018]
disaster = [4193, 2803, 4554, 4250, 1207, 4317, 620, 5573]
train.loc[non_disaster, 'target'] = 0
train.loc[disaster, 'target'] = 1

# Again drop the duplicates from the dataframe
train = train.drop_duplicates(subset=['text', 'target']).reset_index(drop=True)

In [42]:
# Check if any duplicated tweets left
train[train.text.duplicated(keep=False)].sort_values(by='text').head()

Unnamed: 0,id,keyword,location,text,target


In [43]:
EMOTICONS = {
    u":‑\)": "Happy face or smiley",
    u":\)": "Happy face or smiley",
    u":-\]": "Happy face or smiley",
    u":\]": "Happy face or smiley",
    u":-3": "Happy face smiley",
    u":3": "Happy face smiley",
    u":->": "Happy face smiley",
    u":>": "Happy face smiley",
    u"8-\)": "Happy face smiley",
    u":o\)": "Happy face smiley",
    u":-\}": "Happy face smiley",
    u":\}": "Happy face smiley",
    u":-\)": "Happy face smiley",
    u":c\)": "Happy face smiley",
    u":\^\)": "Happy face smiley",
    u"=\]": "Happy face smiley",
    u"=\)": "Happy face smiley",
    u":‑D": "Laughing, big grin or laugh with glasses",
    u":D": "Laughing, big grin or laugh with glasses",
    u"8‑D": "Laughing, big grin or laugh with glasses",
    u"8D": "Laughing, big grin or laugh with glasses",
    u"X‑D": "Laughing, big grin or laugh with glasses",
    u"XD": "Laughing, big grin or laugh with glasses",
    u"=D": "Laughing, big grin or laugh with glasses",
    u"=3": "Laughing, big grin or laugh with glasses",
    u"B\^D": "Laughing, big grin or laugh with glasses",
    u":-\)\)": "Very happy",
    u":‑\(": "Frown, sad, andry or pouting",
    u":-\(": "Frown, sad, andry or pouting",
    u":\(": "Frown, sad, andry or pouting",
    u":‑c": "Frown, sad, andry or pouting",
    u":c": "Frown, sad, andry or pouting",
    u":‑<": "Frown, sad, andry or pouting",
    u":<": "Frown, sad, andry or pouting",
    u":‑\[": "Frown, sad, andry or pouting",
    u":\[": "Frown, sad, andry or pouting",
    u":-\|\|": "Frown, sad, andry or pouting",
    u">:\[": "Frown, sad, andry or pouting",
    u":\{": "Frown, sad, andry or pouting",
    u":@": "Frown, sad, andry or pouting",
    u">:\(": "Frown, sad, andry or pouting",
    u":'‑\(": "Crying",
    u":'\(": "Crying",
    u":'‑\)": "Tears of happiness",
    u":'\)": "Tears of happiness",
    u"D‑':": "Horror",
    u"D:<": "Disgust",
    u"D:": "Sadness",
    u"D8": "Great dismay",
    u"D;": "Great dismay",
    u"D=": "Great dismay",
    u"DX": "Great dismay",
    u":‑O": "Surprise",
    u":O": "Surprise",
    u":‑o": "Surprise",
    u":o": "Surprise",
    u":-0": "Shock",
    u"8‑0": "Yawn",
    u">:O": "Yawn",
    u":-\*": "Kiss",
    u":\*": "Kiss",
    u":X": "Kiss",
    u";‑\)": "Wink or smirk",
    u";\)": "Wink or smirk",
    u"\*-\)": "Wink or smirk",
    u"\*\)": "Wink or smirk",
    u";‑\]": "Wink or smirk",
    u";\]": "Wink or smirk",
    u";\^\)": "Wink or smirk",
    u":‑,": "Wink or smirk",
    u";D": "Wink or smirk",
    u":‑P": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":P": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"X‑P": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"XP": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":‑Þ": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":Þ": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":b": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"d:": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"=p": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    u">:P": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":‑/": "Skeptical, annoyed, undecided, uneasy or hesitant",
    u":/": "Skeptical, annoyed, undecided, uneasy or hesitant",
    u":-[.]": "Skeptical, annoyed, undecided, uneasy or hesitant",
    u">:[(\\\)]": "Skeptical, annoyed, undecided, uneasy or hesitant",
    u">:/": "Skeptical, annoyed, undecided, uneasy or hesitant",
    u":[(\\\)]": "Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=/": "Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=[(\\\)]": "Skeptical, annoyed, undecided, uneasy or hesitant",
    u":L": "Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=L": "Skeptical, annoyed, undecided, uneasy or hesitant",
    u":S": "Skeptical, annoyed, undecided, uneasy or hesitant",
    u":‑\|": "Straight face",
    u":\|": "Straight face",
    u":$": "Embarrassed or blushing",
    u":‑x": "Sealed lips or wearing braces or tongue-tied",
    u":x": "Sealed lips or wearing braces or tongue-tied",
    u":‑#": "Sealed lips or wearing braces or tongue-tied",
    u":#": "Sealed lips or wearing braces or tongue-tied",
    u":‑&": "Sealed lips or wearing braces or tongue-tied",
    u":&": "Sealed lips or wearing braces or tongue-tied",
    u"O:‑\)": "Angel, saint or innocent",
    u"O:\)": "Angel, saint or innocent",
    u"0:‑3": "Angel, saint or innocent",
    u"0:3": "Angel, saint or innocent",
    u"0:‑\)": "Angel, saint or innocent",
    u"0:\)": "Angel, saint or innocent",
    u":‑b": "Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"0;\^\)": "Angel, saint or innocent",
    u">:‑\)": "Evil or devilish",
    u">:\)": "Evil or devilish",
    u"\}:‑\)": "Evil or devilish",
    u"\}:\)": "Evil or devilish",
    u"3:‑\)": "Evil or devilish",
    u"3:\)": "Evil or devilish",
    u">;\)": "Evil or devilish",
    u"\|;‑\)": "Cool",
    u"\|‑O": "Bored",
    u":‑J": "Tongue-in-cheek",
    u"#‑\)": "Party all night",
    u"%‑\)": "Drunk or confused",
    u"%\)": "Drunk or confused",
    u":-###..": "Being sick",
    u":###..": "Being sick",
    u"<:‑\|": "Dump",
    u"\(>_<\)": "Troubled",
    u"\(>_<\)>": "Troubled",
    u"\(';'\)": "Baby",
    u"\(\^\^>``": "Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(\^_\^;\)": "Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(-_-;\)": "Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(~_~;\) \(・\.・;\)": "Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(-_-\)zzz": "Sleeping",
    u"\(\^_-\)": "Wink",
    u"\(\(\+_\+\)\)": "Confused",
    u"\(\+o\+\)": "Confused",
    u"\(o\|o\)": "Ultraman",
    u"\^_\^": "Joyful",
    u"\(\^_\^\)/": "Joyful",
    u"\(\^O\^\)／": "Joyful",
    u"\(\^o\^\)／": "Joyful",
    u"\(__\)": "Kowtow as a sign of respect, or dogeza for apology",
    u"_\(\._\.\)_": "Kowtow as a sign of respect, or dogeza for apology",
    u"<\(_ _\)>": "Kowtow as a sign of respect, or dogeza for apology",
    u"<m\(__\)m>": "Kowtow as a sign of respect, or dogeza for apology",
    u"m\(__\)m": "Kowtow as a sign of respect, or dogeza for apology",
    u"m\(_ _\)m": "Kowtow as a sign of respect, or dogeza for apology",
    u"\('_'\)": "Sad or Crying",
    u"\(/_;\)": "Sad or Crying",
    u"\(T_T\) \(;_;\)": "Sad or Crying",
    u"\(;_;": "Sad of Crying",
    u"\(;_:\)": "Sad or Crying",
    u"\(;O;\)": "Sad or Crying",
    u"\(:_;\)": "Sad or Crying",
    u"\(ToT\)": "Sad or Crying",
    u";_;": "Sad or Crying",
    u";-;": "Sad or Crying",
    u";n;": "Sad or Crying",
    u";;": "Sad or Crying",
    u"Q\.Q": "Sad or Crying",
    u"T\.T": "Sad or Crying",
    u"QQ": "Sad or Crying",
    u"Q_Q": "Sad or Crying",
    u"\(-\.-\)": "Shame",
    u"\(-_-\)": "Shame",
    u"\(一一\)": "Shame",
    u"\(；一_一\)": "Shame",
    u"\(=_=\)": "Tired",
    u"\(=\^\·\^=\)": "cat",
    u"\(=\^\·\·\^=\)": "cat",
    u"=_\^= ": "cat",
    u"\(\.\.\)": "Looking down",
    u"\(\._\.\)": "Looking down",
    u"\^m\^": "Giggling with hand covering mouth",
    u"\(\・\・?": "Confusion",
    u"\(?_?\)": "Confusion",
    u">\^_\^<": "Normal Laugh",
    u"<\^!\^>": "Normal Laugh",
    u"\^/\^": "Normal Laugh",
    u"\（\*\^_\^\*）": "Normal Laugh",
    u"\(\^<\^\) \(\^\.\^\)": "Normal Laugh",
    u"\(^\^\)": "Normal Laugh",
    u"\(\^\.\^\)": "Normal Laugh",
    u"\(\^_\^\.\)": "Normal Laugh",
    u"\(\^_\^\)": "Normal Laugh",
    u"\(\^\^\)": "Normal Laugh",
    u"\(\^J\^\)": "Normal Laugh",
    u"\(\*\^\.\^\*\)": "Normal Laugh",
    u"\(\^—\^\）": "Normal Laugh",
    u"\(#\^\.\^#\)": "Normal Laugh",
    u"\（\^—\^\）": "Waving",
    u"\(;_;\)/~~~": "Waving",
    u"\(\^\.\^\)/~~~": "Waving",
    u"\(-_-\)/~~~ \($\·\·\)/~~~": "Waving",
    u"\(T_T\)/~~~": "Waving",
    u"\(ToT\)/~~~": "Waving",
    u"\(\*\^0\^\*\)": "Excited",
    u"\(\*_\*\)": "Amazed",
    u"\(\*_\*;": "Amazed",
    u"\(\+_\+\) \(@_@\)": "Amazed",
    u"\(\*\^\^\)v": "Laughing,Cheerful",
    u"\(\^_\^\)v": "Laughing,Cheerful",
    u"\(\(d[-_-]b\)\)": "Headphones,Listening to music",
    u'\(-"-\)': "Worried",
    u"\(ーー;\)": "Worried",
    u"\(\^0_0\^\)": "Eyeglasses",
    u"\(\＾ｖ\＾\)": "Happy",
    u"\(\＾ｕ\＾\)": "Happy",
    u"\(\^\)o\(\^\)": "Happy",
    u"\(\^O\^\)": "Happy",
    u"\(\^o\^\)": "Happy",
    u"\)\^o\^\(": "Happy",
    u":O o_O": "Surprised",
    u"o_0": "Surprised",
    u"o\.O": "Surpised",
    u"\(o\.o\)": "Surprised",
    u"oO": "Surprised",
    u"\(\*￣m￣\)": "Dissatisfied",
    u"\(‘A`\)": "Snubbed or Deflated"
}

In [44]:
def clean_text(text):
    """
    Clean Text
    Preprocess the given text by removing noise, special characters, URLs, etc.

    Args:
        text (str): Input text to be cleaned.

    Returns:
        str: Cleaned and preprocessed text.
    """
    # Convert the text to lowercase
    text = text.lower()

    # Remove HTML entities and special characters
    text = re.sub(r'(&amp;|&lt;|&gt;|\n|\t)', ' ', text)

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)  # remove urls

    # Remove email addresses
    text = re.sub(r'\S+@\S+', ' ', text)

    # Remove dates in various formats (e.g., DD-MM-YYYY, MM/DD/YY)
    text = re.sub(r'\d{1,2}(st|nd|rd|th)?[-./]\d{1,2}[-./]\d{2,4}', ' ', text)

    # Remove month-day-year patterns (e.g., Jan 1st, 2022)
    pattern = re.compile(r'(\d{1,2})?(st|nd|rd|th)?[-./,]?\s?(of)?\s?([J|j]an(uary)?|[F|f]eb(ruary)?|[Mm]ar(ch)?|[Aa]pr(il)?|[Mm]ay|[Jj]un(e)?|[Jj]ul(y)?|[Aa]ug(ust)?|[Ss]ep(tember)?|[Oo]ct(ober)?|[Nn]ov(ember)?|[Dd]ec(ember)?)\s?(\d{1,2})?(st|nd|rd|th)?\s?[-./,]?\s?(\d{2,4})?')
    text = pattern.sub(r' ', text)

    # Remove emoticons
    emoticons_pattern = re.compile(u'(' + u'|'.join(emo for emo in EMOTICONS) + u')')
    text = emoticons_pattern.sub(r' ', text)

    # Remove mentions (@) and hashtags (#)
    text = re.sub(r'(@\S+|#\S+)', ' ', text)

    # Fix contractions (e.g., "I'm" becomes "I am")
    text = contractions.fix(text)

    # Remove punctuation
    PUNCTUATIONS = string.punctuation
    text = text.translate(str.maketrans('', '', PUNCTUATIONS))

    # Remove unicode
    text = unidecode.unidecode(text)

    # Replace multiple whitespaces with a single space
    text = re.sub(r'\s+', ' ', text)

    return text

In [45]:
train['cleaned_text'] = train['text'].apply(lambda x: clean_text(x))

In [29]:
"""
# Cleaning the text
text_processor = TextPreProcessor(
    # terms that will be omitted
    omit=['url', 'email'],
    # terms that will be normalized
    normalize=['percent', 'money', 'phone', 'user',
        'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens

    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",

    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter",

    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words

    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

# Cleaned Text
train['processed_text'] = train.text.apply(lambda x: " ".join(text_processor.pre_process_doc(x)))
test['processed_text'] = test.text.apply(lambda x: " ".join(text_processor.pre_process_doc(x)))

class TextCleaner(BaseEstimator, TransformerMixin):
    def __init__(self, feature_name, text_processor):
        self.feature_name = feature_name
        self.text_processor = text_processor
    
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X['processed_' + self.feature_name] = X[self.feature_name].apply(lambda x: ' '.join(self.text_processor.pre_process_doc(x)))
        return X['processed_' + self.feature_name]
"""

'\n# Cleaning the text\ntext_processor = TextPreProcessor(\n    # terms that will be omitted\n    omit=[\'url\', \'email\'],\n    # terms that will be normalized\n    normalize=[\'percent\', \'money\', \'phone\', \'user\',\n        \'time\', \'date\', \'number\'],\n    # terms that will be annotated\n    annotate={"hashtag", "allcaps", "elongated", "repeated",\n        \'emphasis\', \'censored\'},\n    fix_html=True,  # fix HTML tokens\n\n    # corpus from which the word statistics are going to be used\n    # for word segmentation\n    segmenter="twitter",\n\n    # corpus from which the word statistics are going to be used\n    # for spell correction\n    corrector="twitter",\n\n    unpack_hashtags=True,  # perform word segmentation on hashtags\n    unpack_contractions=True,  # Unpack contractions (can\'t -> can not)\n    spell_correct_elong=False,  # spell correction for elongated words\n\n    # select a tokenizer. You can use SocialTokenizer, or pass your own\n    # the tokenizer, sh

## Converting text to vectors

There are many ways to convert text to vectors. We will use the TF-IDF methods.

In [46]:
# TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english', min_df=2, max_df=0.75, ngram_range=(1,2))

## Perform Stratified K-Fold Cross-Validation

In [47]:
scv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [48]:
# ('cleaner', TextCleaner('text', text_processor)),
text_clf_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words='english', min_df=2, max_df=0.75, ngram_range=(1,2))),
    ('clf', None)
])

In [51]:
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,our deeds are the reason of this allah forgive us all
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,all residents asked to shelter in place are being notified by officers no other evacuation or shelter in place orders are expected
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1,13000 people receive evacuation orders in california
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,just got sent this photo from ruby as smoke from pours into a school


## Modeling

In [52]:
# Vanilla Models
models = {
    'logistic_regression': LogisticRegression(),
    'random_forest': RandomForestClassifier(),
    'xgboost': XGBClassifier(),
    'multinomial_nb': MultinomialNB(),
    'svm': SVC()
}

In [53]:
EXPERIMENT_NAME = "vanilla-model-custom-clean-text"
mlflow.set_experiment(EXPERIMENT_NAME)

2023/08/18 22:04:05 INFO mlflow.tracking.fluent: Experiment with name 'vanilla-model-custom-clean-text' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://mlops-zc-ta-dev-model-registry/5', creation_time=1692396245531, experiment_id='5', last_update_time=1692396245531, lifecycle_stage='active', name='vanilla-model-custom-clean-text', tags={}>

In [54]:
for model in models:
    # Cross-validation
    # clf = models[model]
    with mlflow.start_run():
        mlflow.set_tag('developer', 'sagar')
    
        mlflow.set_tag('model', model)
        text_clf_pipeline.set_params(clf=models[model])

        scores = cross_val_score(text_clf_pipeline, train['cleaned_text'], train.target, cv=scv, scoring='f1')
        mlflow.log_metric('f1', scores.mean())
        mlflow.log_metric('f1_std', scores.std())
        
        print(f'{model} F1: {scores.mean():.3f} +/- {scores.std():.3f}')

        # mlflow.sklearn.log_model(text_clf_pipeline, 'models')

logistic_regression F1: 0.728 +/- 0.017
random_forest F1: 0.713 +/- 0.008
xgboost F1: 0.673 +/- 0.017
multinomial_nb F1: 0.713 +/- 0.017
svm F1: 0.719 +/- 0.015


`Logistic Regression`, `Multinomial NB`, and `SVM` model performance were similar with `Logistic Regression` performing slightly better. Let's further tune these models to see if we can improve performance.

### Hyperparameter Tuning of Vanilla Models

In [55]:
params = {
    'logistic_regression': {
        'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'clf__penalty': ['l2', 'l1'],
        'clf__solver': ['sag', 'liblinear']
    },
    'multinomial_nb': {
        'clf__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
    },
    'svm': {
        'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'clf__gamma': ['scale', 'auto']
    }
}

In [56]:
EXPERIMENT_NAME = "hyperparameter-tuning-custom-cleaning-text"
mlflow.set_experiment(EXPERIMENT_NAME)

2023/08/18 22:05:56 INFO mlflow.tracking.fluent: Experiment with name 'hyperparameter-tuning-custom-cleaning-text' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://mlops-zc-ta-dev-model-registry/6', creation_time=1692396356354, experiment_id='6', last_update_time=1692396356354, lifecycle_stage='active', name='hyperparameter-tuning-custom-cleaning-text', tags={}>

In [57]:
n_iter_search = 10

for model in params:
    with mlflow.start_run():
        # clf = models[model]
        mlflow.set_tag('developer', 'sagar')
    
        mlflow.set_tag('model', model)
        text_clf_pipeline.set_params(clf=models[model])

        random_search = RandomizedSearchCV(text_clf_pipeline, param_distributions=params[model], n_iter=n_iter_search, cv=scv, scoring='f1', random_state=42)
        random_search.fit(train['cleaned_text'], train.target)

        mlflow.log_params(random_search.best_params_)
        # mlflow.sklearn.log_model(random_search.best_estimator_, 'models')

        print('Model: {}'.format(model))
        print('Best score: {}'.format(random_search.best_score_))
        print('Best params: {}'.format(random_search.best_params_))
        print('')


Model: logistic_regression
Best score: 0.7282459458129509
Best params: {'clf__solver': 'liblinear', 'clf__penalty': 'l2', 'clf__C': 1}

Model: multinomial_nb
Best score: 0.7167073760230425
Best params: {'clf__alpha': 0.1}

Model: svm
Best score: 0.7302802992549712
Best params: {'clf__kernel': 'linear', 'clf__gamma': 'scale', 'clf__C': 1}



The cross validation scores for the models are very similar. `SVM` has the highest score, but it is not significantly higher than the other models. For simplicity, speed, and better interpretability, I will use `MultiNomialNB` for the final model.

## Predict on the Test Set

In [58]:
test['cleaned_text'] = test.text.apply(lambda x: clean_text(x))

In [59]:
params = {
    'solver': 'liblinear', 
    'penalty': 'l2', 
    'C': 1.0
}

pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words='english', min_df=2, max_df=0.75, ngram_range=(1,2))),
    ('clf', LogisticRegression(**params))
])

In [60]:
pipeline.fit(train.cleaned_text, train.target)
y_pred = pipeline.predict(test.cleaned_text)

In [61]:
y_pred

array([1, 0, 1, ..., 1, 1, 1])

In [63]:
pipeline.predict(test.cleaned_text[:2])

array([1, 0])

In [64]:
with mlflow.start_run(run_name='logistic-regression-model-0.1'):
    mlflow.log_params(params)
    mlflow.set_tag('developer', 'sagar')

    mlflow.set_tag('model', 'LogisticRegression')
    mlflow.set_tag('status', 'final')

    mlflow.sklearn.log_model(pipeline, 'models')

In [68]:
loaded_pipeline = mlflow.pyfunc.load_model('s3://mlops-zc-sd-dev-model-registry/6/465a65643f584504a46364b45fec831d/artifacts/models')

In [67]:
test.head()

Unnamed: 0,id,keyword,location,text,cleaned_text
0,0,,,Just happened a terrible car crash,just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, stay safe everyone.",heard about is different cities stay safe everyone
2,3,,,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all",there is a forest fire at spot pond geese are fleeing across the street i cannot save them all
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kills 28 in china and taiwan


In [69]:
loaded_pipeline.predict(test.cleaned_text[:2])

array([1, 0])

In [37]:
submission = pd.DataFrame({
    'id': test['id'],
    'target': y_pred
})

In [30]:
submission.to_csv('../data/submission.csv', index=False)

In [31]:
!kaggle competitions submit -c nlp-getting-started -f ../data/submission.csv -m "First submission - NB"

100%|██████████████████████████████████████| 22.2k/22.2k [00:00<00:00, 34.5kB/s]
Successfully submitted to Natural Language Processing with Disaster Tweets

Multinomial Naive Bayes Scores:
Text Processing Technique: Ekphrasis
- F1 Score on train set: 0.73764
- F1 Score on test set: 0.79834

Logistic Regression Scores:
Text Processing Technique: Custom
- F1 Score on train set: 0.72841
- F1 Score on test set: 0.79221

The model performance does not indicate overfitting. The F1 score on the test set is higher than the F1 score on the train set. The model is performing well on the test set.

## Transformer Models (Future Scope)

In [16]:
import torch

from tqdm import tqdm
from transformers import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import CrossEntropyLoss

In [17]:
num_classes = 2  # The number of classes in the dataset
model_name = "bert-base-uncased"  # Use the appropriate BERT model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Create a dataset from the pandas dataframe
train_df = Dataset.from_pandas(train[['processed_text', 'target']])

In [19]:
batch_size = 64

# Tokenize input text
encoded_data_train = tokenizer.batch_encode_plus(train_df['processed_text'], add_special_tokens=True, padding=True, truncation=True, return_tensors='pt')

# Create DataLoader
train_dataset = TensorDataset(encoded_data_train['input_ids'], encoded_data_train['attention_mask'], torch.tensor(train_df['target']))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [22]:
learning_rate = 2e-5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=learning_rate)
loss_fn = CrossEntropyLoss()

In [24]:
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', leave=False)
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        progress_bar.set_postfix({'Loss': loss.item()}, refresh=True)
    progress_bar.close()



In [25]:
model.save_pretrained('./fine_tuned_model')

In [None]:
# Giscart
# Validation of ML models
# Product manager
# Try to find things not working properly
# giskard