# PREPROCESSING DATA

In [1]:
import pandas as pd
import re
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from functools import reduce
# Load the project's own stop-word list: one entry per line, with the
# trailing newline removed from each line.
with open('./dataset/stopwords.txt') as stopwords_file:
    stopwords_list = [line.rstrip('\n') for line in stopwords_file]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the raw CSV files; both frames are later passed to preprocessing_csv,
# which reads their 'text' and 'keyword' columns.
tweet = pd.read_csv('./dataset/train.csv')
test = pd.read_csv('./dataset/test.csv')

In [3]:
# Text-cleaning helper functions
def remove_URL(text):
    """Strip links from ``text``.

    Every occurrence of ``http`` followed by at least one non-whitespace
    character is deleted, up to the next whitespace. Links without an
    ``http`` prefix (for example a bare ``www.`` address) are left as they are.
    """
    link_pattern = re.compile(r'http\S+')
    return link_pattern.sub('', text)

# Remove emoji / pictographs and simple text emoticons.

# Code points stripped as "emoji". Several ranges are intentionally kept as
# broad as before so existing results do not change; note that
# U+24C2-U+1F251 and U+10000-U+10FFFF also cover many non-emoji characters
# (CJK ideographs among them).
_EMOJI_RE = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    u"\U00002500-\U00002BEF"  # box drawing .. misc symbols and arrows
    u"\U00002702-\U000027B0"  # dingbats
    u"\U000024C2-\U0001F251"  # enclosed characters .. (very broad range)
    u"\U0001f926-\U0001f937"  # gesture emoji
    u"\U00010000-\U0010ffff"  # every supplementary-plane code point
    u"\u2640-\u2642"          # gender signs
    u"\u2600-\u2B55"          # misc symbols .. heavy circle
    u"\u200d"                 # zero-width joiner
    u"\u23cf"                 # eject symbol
    u"\u23e9"                 # fast-forward symbol
    u"\u231a"                 # watch
    u"\ufe0f"                 # variation selector-16
    u"\u3030"                 # wavy dash
    "]+", re.UNICODE)

# Text emoticons such as ":)", ";-p" or "=d": eyes, optional nose, then a
# mouth. The lookarounds require the emoticon not to touch a word character,
# so ordinary text like "warning:danger" or "3:30" is left alone.
_EMOTICON_RE = re.compile(r'(?<!\w)[:;=]-?[)(pvox3d](?!\w)')


def remove_emojis(text):
    """Return ``text`` without emoji code points and stand-alone emoticons.

    Parameters
    ----------
    text : str
        Input string. Emoticon mouths are matched in lowercase only.

    Returns
    -------
    str
        ``text`` with every run of characters in the emoji class deleted and
        every emoticon (eyes ``: ; =``, optional ``-``, mouth from
        ``) ( p v o x 3 d``) deleted when it is not adjacent to a word
        character. Nothing is inserted in place of the removed characters.

    Notes
    -----
    The previous pattern had an empty mouth alternative and no boundaries, so
    it also deleted bare ``:``/``;``/``=`` and chewed into words and numbers
    (``"warning:danger"`` -> ``"warninganger"``). Bare punctuation is no
    longer removed here.
    """
    without_emoji = _EMOJI_RE.sub('', text)
    return _EMOTICON_RE.sub('', without_emoji)

def remove_stopwords(text):
    """Drop every whitespace-separated token found in ``stopwords_list``.

    The surviving tokens are re-joined with single spaces, so runs of
    whitespace in the input are collapsed as a side effect.
    """
    # The module-level custom list is used here; NLTK's
    # stopwords.words('english') was the alternative considered.
    kept_tokens = filter(lambda token: token not in stopwords_list, text.split())
    return ' '.join(kept_tokens)

# Symbols
def remove_invalid_char(text):
    """Replace each character that is neither an ASCII letter nor whitespace with a space."""
    non_letter_pattern = re.compile(r'[^a-zA-Z\s]')
    return non_letter_pattern.sub(' ', text)

# Remove surrounding whitespace
def remove_leading_whitespace(text):
    """Return ``text`` with whitespace trimmed from both the start and the end."""
    trimmed = text.strip()
    return trimmed

def to_lower(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

# Tag
def remove_mention(text):
    """Delete every ``@`` together with the non-whitespace characters that follow it.

    The whole handle is removed, not just the ``@`` sign.
    """
    mention_pattern = re.compile(r'@\S+')
    return mention_pattern.sub('', text)

def lemmatize(text):
    """Tokenize ``text`` and lemmatize every token as a verb.

    Tokens come from ``word_tokenize``; each one is passed to
    ``WordNetLemmatizer.lemmatize`` with part-of-speech ``'v'`` and the
    results are joined back together with single spaces.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    return ' '.join(
        wordnet_lemmatizer.lemmatize(token, 'v') for token in word_tokenize(text)
    )

In [4]:
# Replacement table used by replace_misspell: contractions, British spellings
# and common typos/run-together words mapped to the text that should replace
# them.
# _get_mispell joins the keys into one regex alternation, compiled without
# flags, so keys are matched case-sensitively and anywhere in the text (no
# word boundaries).
# NOTE(review): preprocessing_csv applies to_lower before replace_misspell, so
# keys containing capitals ("I'd", "Qoura", "Whta", "doI", "theBest",
# "Etherium", ...) cannot match in that pipeline.
# NOTE(review): the key 'youtu ' ends with a space, which is part of the match.
mispell_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot",
                "could've": "could have", "couldn't": "could not", "didn't": "did not",
                "doesn't": "does not", "don't": "do not", "hadn't": "had not",
                "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will",
                "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
                "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
                "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
                "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", 
                "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have",
                "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us",
                "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not",
                "mightn't've": "might not have", "must've": "must have", "mustn't": "must not",
                "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
                "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", 
                "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would",
                "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
                "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
                "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would",
                "that'd've": "that would have", "that's": "that is", "there'd": "there would", 
                "there'd've": "there would have", "there's": "there is", "here's": "here is",
                "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
                "they'll've": "they will have", "they're": "they are", "they've": "they have",
                "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have",
                "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have",
                "weren't": "were not", "what'll": "what will", "what'll've": "what will have",
                "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", 
                "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have",
                "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not",
                "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
                "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
                "you're": "you are", "you've": "you have", 'colour': 'color', 'centre': 'center',
                'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling',
                'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 
                'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 
                'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 
                'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I',
                'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate',
                "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist',
                'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend',
                'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization',
                'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

def replace_misspell(text):
    """Rewrite every occurrence of a ``mispell_dict`` key in ``text`` with its mapped value.

    The lookup table and its compiled pattern are obtained from
    ``_get_mispell`` on each call.
    """
    lookup, pattern = _get_mispell(mispell_dict)
    # Each match is one of the dict's keys, so it can be used directly as the lookup key.
    return pattern.sub(lambda found: lookup[found.group(0)], text)

In [5]:
# # Thanks to https://www.kaggle.com/rftexas/text-only-kfold-bert
# abbreviations = {
#     "4ao" : "for adults only",
#     "a3" : "anytime anywhere anyplace",
#     "aamof" : "as a matter of fact",
#     "acct" : "account",
#     "adih" : "another day in hell",
#     "afaic" : "as far as i am concerned",
#     "afaict" : "as far as i can tell",
#     "afaik" : "as far as i know",
#     "afair" : "as far as i remember",
#     "afk" : "away from keyboard",
#     "app" : "application",
#     "approx" : "approximately",
#     "apps" : "applications",
#     "asap" : "as soon as possible",
#     "asl" : "age sex location",
#     "atk" : "at the keyboard",
#     "ave." : "avenue",
#     "aymm" : "are you my mother",
#     "ayor" : "at your own risk", 
#     "b&b" : "bed and breakfast",
#     "b+b" : "bed and breakfast",
#     "b.c" : "before christ",
#     "b2b" : "business to business",
#     "b2c" : "business to customer",
#     "b4" : "before",
#     "b4n" : "bye for now",
#     "b@u" : "back at you",
#     "bae" : "before anyone else",
#     "bak" : "back at keyboard",
#     "bbbg" : "bye bye be good",
#     "bbc" : "british broadcasting corporation",
#     "bbias" : "be back in a second",
#     "bbl" : "be back later",
#     "bbs" : "be back soon",
#     "be4" : "before",
#     "bfn" : "bye for now",
#     "blvd" : "boulevard",
#     "bout" : "about",
#     "brb" : "be right back",
#     "bros" : "brothers",
#     "brt" : "be right there",
#     "bsaaw" : "big smile and a wink",
#     "btw" : "by the way",
#     "bwl" : "bursting with laughter",
#     "c/o" : "care of",
#     "cet" : "central european time",
#     "cf" : "compare",
#     "cia" : "central intelligence agency",
#     "csl" : "can not stop laughing",
#     "cu" : "see you",
#     "cul8r" : "see you later",
#     "cv" : "curriculum vitae",
#     "cwot" : "complete waste of time",
#     "cya" : "see you",
#     "cyt" : "see you tomorrow",
#     "dae" : "does anyone else",
#     "dbmib" : "do not bother me i am busy",
#     "diy" : "do it yourself",
#     "dm" : "direct message",
#     "dwh" : "during work hours",
#     "e123" : "easy as one two three",
#     "eet" : "eastern european time",
#     "eg" : "example",
#     "embm" : "early morning business meeting",
#     "encl" : "enclosed",
#     "encl." : "enclosed",
#     "etc" : "and so on",
#     "faq" : "frequently asked questions",
#     "fawc" : "for anyone who cares",
#     "fb" : "facebook",
#     "fc" : "fingers crossed",
#     "fig" : "figure",
#     "fimh" : "forever in my heart", 
#     "ft." : "feet",
#     "ft" : "featuring",
#     "ftl" : "for the loss",
#     "ftw" : "for the win",
#     "fwiw" : "for what it is worth",
#     "fyi" : "for your information",
#     "g9" : "genius",
#     "gahoy" : "get a hold of yourself",
#     "gal" : "get a life",
#     "gcse" : "general certificate of secondary education",
#     "gfn" : "gone for now",
#     "gg" : "good game",
#     "gl" : "good luck",
#     "glhf" : "good luck have fun",
#     "gmt" : "greenwich mean time",
#     "gmta" : "great minds think alike",
#     "gn" : "good night",
#     "g.o.a.t" : "greatest of all time",
#     "goat" : "greatest of all time",
#     "goi" : "get over it",
#     "gps" : "global positioning system",
#     "gr8" : "great",
#     "gratz" : "congratulations",
#     "gyal" : "girl",
#     "hc" : "hot and cold",
#     "hp" : "horsepower",
#     "hr" : "hour",
#     "hrh" : "his royal highness",
#     "ht" : "height",
#     "ibrb" : "i will be right back",
#     "ic" : "i see",
#     "icq" : "i seek you",
#     "icymi" : "in case you missed it",
#     "idc" : "i do not care",
#     "idgadf" : "i do not give a damn fuck",
#     "idgaf" : "i do not give a fuck",
#     "idk" : "i do not know",
#     "ie" : "that is",
#     "i.e" : "that is",
#     "ifyp" : "i feel your pain",
#     "IG" : "instagram",
#     "iirc" : "if i remember correctly",
#     "ilu" : "i love you",
#     "ily" : "i love you",
#     "imho" : "in my humble opinion",
#     "imo" : "in my opinion",
#     "imu" : "i miss you",
#     "iow" : "in other words",
#     "irl" : "in real life",
#     "j4f" : "just for fun",
#     "jic" : "just in case",
#     "jk" : "just kidding",
#     "jsyk" : "just so you know",
#     "l8r" : "later",
#     "lb" : "pound",
#     "lbs" : "pounds",
#     "ldr" : "long distance relationship",
#     "lmao" : "laugh my ass off",
#     "lmfao" : "laugh my fucking ass off",
#     "lol" : "laughing out loud",
#     "ltd" : "limited",
#     "ltns" : "long time no see",
#     "m8" : "mate",
#     "mf" : "motherfucker",
#     "mfs" : "motherfuckers",
#     "mfw" : "my face when",
#     "mofo" : "motherfucker",
#     "mph" : "miles per hour",
#     "mr" : "mister",
#     "mrw" : "my reaction when",
#     "ms" : "miss",
#     "mte" : "my thoughts exactly",
#     "nagi" : "not a good idea",
#     "nbc" : "national broadcasting company",
#     "nbd" : "not big deal",
#     "nfs" : "not for sale",
#     "ngl" : "not going to lie",
#     "nhs" : "national health service",
#     "nrn" : "no reply necessary",
#     "nsfl" : "not safe for life",
#     "nsfw" : "not safe for work",
#     "nth" : "nice to have",
#     "nvr" : "never",
#     "nyc" : "new york city",
#     "oc" : "original content",
#     "og" : "original",
#     "ohp" : "overhead projector",
#     "oic" : "oh i see",
#     "omdb" : "over my dead body",
#     "omg" : "oh my god",
#     "omw" : "on my way",
#     "p.a" : "per annum",
#     "p.m" : "after midday",
#     "pm" : "prime minister",
#     "poc" : "people of color",
#     "pov" : "point of view",
#     "pp" : "pages",
#     "ppl" : "people",
#     "prw" : "parents are watching",
#     "ps" : "postscript",
#     "pt" : "point",
#     "ptb" : "please text back",
#     "pto" : "please turn over",
#     "qpsa" : "what happens", #"que pasa",
#     "ratchet" : "rude",
#     "rbtl" : "read between the lines",
#     "rlrt" : "real life retweet", 
#     "rofl" : "rolling on the floor laughing",
#     "roflol" : "rolling on the floor laughing out loud",
#     "rotflmao" : "rolling on the floor laughing my ass off",
#     "rt" : "retweet",
#     "ruok" : "are you ok",
#     "sfw" : "safe for work",
#     "sk8" : "skate",
#     "smh" : "shake my head",
#     "sq" : "square",
#     "srsly" : "seriously", 
#     "ssdd" : "same stuff different day",
#     "tbh" : "to be honest",
#     "tbs" : "tablespoonful",
#     "tbsp" : "tablespoonful",
#     "tfw" : "that feeling when",
#     "thks" : "thank you",
#     "tho" : "though",
#     "thx" : "thank you",
#     "tia" : "thanks in advance",
#     "til" : "today i learned",
#     "tl;dr" : "too long i did not read",
#     "tldr" : "too long i did not read",
#     "tmb" : "tweet me back",
#     "tntl" : "trying not to laugh",
#     "ttyl" : "talk to you later",
#     "u" : "you",
#     "u2" : "you too",
#     "u4e" : "yours for ever",
#     "utc" : "coordinated universal time",
#     "w/" : "with",
#     "w/o" : "without",
#     "w8" : "wait",
#     "wassup" : "what is up",
#     "wb" : "welcome back",
#     "wtf" : "what the fuck",
#     "wtg" : "way to go",
#     "wtpa" : "where the party at",
#     "wuf" : "where are you from",
#     "wuzup" : "what is up",
#     "wywh" : "wish you were here",
#     "yd" : "yard",
#     "ygtr" : "you got that right",
#     "ynk" : "you never know",
#     "zzz" : "sleeping bored and tired",
# }

# def convert_abbrev(word):
#     return abbreviations[word.lower()] if word.lower() in abbreviations.keys() else word

# def convert_abbrev_in_text(text):
#     tokens = word_tokenize(text)
#     tokens = [convert_abbrev(word) for word in tokens]
#     text = ' '.join(tokens)
#     return text

In [6]:
def preprocessing_csv(df):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    The ``text`` column is passed through each cleaning step in order, one
    full pass over the column per step. In the ``keyword`` column every
    ``%20`` is replaced by a space.
    """
    # Order matters: each step receives the previous step's output.
    text_steps = (
        to_lower,
        replace_misspell,
        remove_URL,
        remove_mention,
        remove_emojis,
        # remove_invalid_char followed by the whitespace trim together act
        # as punctuation removal
        remove_invalid_char,
        remove_leading_whitespace,
        # abbreviation expansion (convert_abbrev_in_text) is currently disabled
        remove_stopwords,
        lemmatize,
    )

    cleaned = df.copy()
    for step in text_steps:
        cleaned['text'] = cleaned['text'].apply(step)

    cleaned['keyword'] = cleaned['keyword'].str.replace('%20', ' ')

    return cleaned

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def group_similar_texts_with_same_loc(df, threshold):
    """Merge rows whose texts are near-duplicates and whose locations are equal.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``id``, ``keyword``, ``location``, ``text``
        and ``target``. Rows are addressed by position, so any index works.
    threshold : float
        Two rows count as similar when the cosine similarity of their TF-IDF
        vectors is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        One row per group with columns ``id``, ``keyword``, ``location``,
        ``text`` and ``target``. ``target`` is the first entry of the group's
        ``value_counts()``; ``id``/``keyword``/``location`` come from the
        first member carrying that target; ``text`` is all member texts
        joined with ``", "``.

    Notes
    -----
    * The full ``n x n`` similarity matrix is materialised in memory.
    * Locations are compared with ``==``, so two missing (NaN) locations are
      never considered equal and such rows are never grouped.
    * NOTE(review): a row is filed under the index of the earlier row it
      matched, even when that earlier row was itself filed under a different
      group. Chains of duplicates can therefore end up in separate groups.
      Behaviour kept as-is; confirm whether this is intended.
    """
    # Convert the text data into numerical features using TF-IDF
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df["text"])

    # Calculate the pairwise cosine similarity between documents
    cosine_sim = cosine_similarity(tfidf_matrix)

    # Plain arrays: positional access that matches the similarity matrix rows
    # (label-based df[col][i] breaks on a non-default index) and avoids a
    # pandas lookup per pair.
    locations = df["location"].to_numpy()
    targets = df["target"].to_numpy()

    # Map "group key" (a row position) -> positions of the rows filed under it
    groups = {}
    for i in range(len(df)):
        group_id = None
        # Only earlier rows above the similarity threshold can match, so the
        # threshold test is done once per row instead of once per pair.
        for j in (cosine_sim[i, :i] > threshold).nonzero()[0]:
            j = int(j)
            if locations[i] == locations[j] and (
                group_id is None or targets[j] > targets[group_id]
            ):
                # First matching row wins unless a later match has a
                # strictly larger target.
                group_id = j
        if group_id is None:
            group_id = i
        groups.setdefault(group_id, []).append(i)

    # Create a new DataFrame with the grouped data and the most common target in each group
    grouped_data = []
    for group in groups.values():
        members = df.iloc[group]
        most_common_target = members["target"].value_counts().index[0]
        representative = members[members["target"] == most_common_target].iloc[0]
        grouped_data.append({
            "id": representative["id"],
            "keyword": representative["keyword"],
            "location": representative["location"],
            "text": ", ".join(members["text"]),
            "target": most_common_target
        })
    grouped_df = pd.DataFrame(grouped_data)

    return grouped_df

In [8]:
# Clean the training tweets, merge near-duplicate rows that share a location,
# drop rows left with no text, and write the result to disk.
SIMILARITY_THRESHOLD = 0.9  # cosine-similarity cut-off for merging rows

df = preprocessing_csv(tweet)
train_df = group_similar_texts_with_same_loc(df, SIMILARITY_THRESHOLD)
empty_text_rows = train_df[train_df['text'] == '']
train_df = train_df.drop(empty_text_rows.index)
# No replace('', 'none') is needed here: every row with empty text was just
# dropped, so that replacement could never change anything.
train_df.to_csv('./preprocessing/train.csv', index = False)

In [9]:
# Clean the test tweets; rows whose text is empty after cleaning are kept and
# get the placeholder text 'none' instead of being dropped.
df_test = preprocessing_csv(test).assign(
    text=lambda frame: frame['text'].replace('', 'none')
)
df_test.to_csv('./preprocessing/test.csv', index = False)