In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under /kaggle/input.
# NOTE: after the loop, `dirname` stays bound to the last directory that os.walk
# visited. Avoid relying on that value as the dataset path, since it depends on
# traversal order.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/), which is preserved as output when you create a version using "Save & Run All".
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session.
# Directory that the cleaned CSV files are written to.
output_dir ='/kaggle/working/'

/kaggle/input/dl-kaggle-dataset/cleaned_train_x.csv
/kaggle/input/dl-kaggle-dataset/cleaned_val_x.csv
/kaggle/input/dl-kaggle-dataset/cleaned_test_x.csv
/kaggle/input/dl-kaggle-dataset/train_y.csv
/kaggle/input/dl-kaggle-dataset/train_x.csv
/kaggle/input/dl-kaggle-dataset/test_x.csv
/kaggle/input/dl-kaggle-dataset/glove.840B.300d.txt
/kaggle/input/dl-kaggle-dataset/val_x.csv
/kaggle/input/dl-kaggle-dataset/val_y.csv
/kaggle/input/dl-kaggle-dataset/cleanwords.txt
/kaggle/input/dl-kaggle-dataset/crawl-300d-2M.vec


In [3]:
import re
from collections import defaultdict
import unicodedata
import emoji
from unidecode import unidecode

In [4]:
# Load the raw train/validation/test splits.
# The dataset directory is named explicitly instead of reusing `dirname`, the
# variable left over from an os.walk loop: that value is simply the last
# directory walked, so it silently changes if another dataset or a
# sub-directory is present under /kaggle/input.
input_dir = '/kaggle/input/dl-kaggle-dataset'
train_x = pd.read_csv(os.path.join(input_dir, 'train_x.csv'))
val_x = pd.read_csv(os.path.join(input_dir, 'val_x.csv'))
test_x = pd.read_csv(os.path.join(input_dir, 'test_x.csv'))

In [5]:
# Build the typo -> correction table used by clean_text(). Each line of
# cleanwords.txt is "<typo>,<correct>"; the typo is later applied as a regex pattern.
# The path is spelled out instead of reusing `dirname` left over from an
# os.walk loop, whose final value depends on directory traversal order.
cl_path = os.path.join('/kaggle/input/dl-kaggle-dataset', 'cleanwords.txt')
clean_word_dict = {}
with open(cl_path, 'r', encoding='utf-8') as cl:
    for line in cl:
        line = line.strip('\n')
        if not line:
            # Tolerate blank lines (e.g. an extra newline at the end of the
            # file) instead of failing on the tuple unpacking below.
            continue
        typo, correct = line.split(',')
        clean_word_dict[typo] = correct

# Regex matching every character that is not alphanumeric, a space, or one of ?!.,:
special_character_removal = re.compile(r'[^?!.,:a-z\d ]', re.IGNORECASE)

# Regex matching runs of digits; clean_text() replaces them with a space.
replace_numbers = re.compile(r'\d+', re.IGNORECASE)
# Token -> occurrence count, filled in by clean_text() when count_null_words is set.
word_count_dict = defaultdict(int)
toxic_dict = {}

# Typographic marks blanked out before Unicode folding. The original pattern
# closed its character class early ("[...]" followed by a literal run of marks),
# so it practically never matched. Curly apostrophes are deliberately left out so
# the contraction rules below still see them, and the marks become a space
# (not "") so that "word—word" is not glued into a single token.
_DECORATION_CHARS = re.compile(r"[“”—…˚«»▄·ˈ]")

# Wiki image references such as "image:foo.jpg".
_WIKI_IMAGE = re.compile(r"image:[a-zA-Z0-9]*\.(?:jpg|png|gif|bmp)")

# Wiki link namespaces whose "[[namespace:...]" / "[[namespace:...|" prefixes are dropped.
_WIKI_NAMESPACES = ("user", "wikipedia", "special", "category")

# Ordered (pattern, replacement) rules. Order matters: specific contractions
# ("what's", "can't") must run before the generic "'s" / "n't" rules, and the
# whitespace collapse must run last.
_SUBSTITUTION_RULES = (
    # Contractions (straight or curly apostrophe).
    (r"what['’]s", "what is "),
    (r"['’]s", " "),
    (r"['’]ve", " have "),
    (r"can['’]t", "cannot "),
    (r"n['’]t", " not "),
    (r"i['’]m", "i am "),
    (r"['’]re", " are "),
    (r"['’]d", " would "),
    (r"['’]ll", " will "),
    # Punctuation: dropped, or padded with spaces so it becomes its own token.
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"/", " "),
    (r"\?", " ? "),
    (r"\"", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"-", " - "),
    (r"=", " = "),
    (r"'", " "),
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    # Abbreviations that the punctuation rules above split apart.
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # "1990s" -> "1990". This was r"\0s", which is an octal escape matching a
    # NUL character followed by "s", not the literal text "0s".
    (r"0s", "0"),
    (r" 9 11 ", "911"),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    (r"\s{2,}", " "),
)


def clean_text(text, remove_stopwords=False, stem_words=False, count_null_words=True, clean_wiki_tokens=True):
    """Normalize one comment string for downstream tokenization.

    The text is lower-cased, URLs and IPv4 addresses are removed, Unicode is
    folded to ASCII, emoji shortcodes and (optionally) wiki markup are blanked,
    the typo table ``clean_word_dict`` is applied, contractions are expanded,
    punctuation is dropped or space-separated, and digit runs are replaced by
    a space.

    Args:
        text: The comment to clean; must be a ``str``.
        remove_stopwords: Accepted for interface compatibility; not used.
        stem_words: If true, stem every token with NLTK's English
            SnowballStemmer (requires ``nltk``).
        count_null_words: If true, add every resulting token to the
            module-level ``word_count_dict`` and re-join the tokens with
            single spaces.
        clean_wiki_tokens: If true, blank out wiki image references, CSS hex
            colours, ``{| ... |}`` tables and user/wikipedia/special/category
            link prefixes.

    Returns:
        The cleaned text as a ``str``.
    """
    text = text.lower()
    # Strip URLs, then IPv4 addresses.
    text = re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "", text)
    text = re.sub(r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}", "", text)
    text = _DECORATION_CHARS.sub(" ", text)

    # Normalize unicode
    text = unicodedata.normalize('NFKC', text)
    # Emoji become ":name:" shortcodes, which are then blanked out.
    text = emoji.demojize(text)
    text = re.sub(r':[a-z_]+:', ' ', text)
    # Transliterate the remaining non-ASCII characters (accents, tone marks).
    text = unidecode(text)

    if clean_wiki_tokens:
        # Drop image references.
        text = _WIKI_IMAGE.sub(" ", text)

        # Drop CSS hex colours and {| ... |} tables.
        text = re.sub(r"#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3})", " ", text)
        text = re.sub(r"\{\|[^\}]*\|\}", " ", text)

        # Clean link templates.
        # NOTE(review): ".*" is greedy, so everything up to the LAST "]" or "|"
        # on the same line is removed. Kept as in the original; confirm this
        # is intended.
        for namespace in _WIKI_NAMESPACES:
            for closer in (r"\]", r"\|"):
                text = re.sub(r"\[?\[" + namespace + r":.*" + closer, " ", text)

    # Apply the typo table; each key is used as a regex pattern.
    for typo, correct in clean_word_dict.items():
        text = re.sub(typo, " " + correct + " ", text)

    for pattern, replacement in _SUBSTITUTION_RULES:
        text = re.sub(pattern, replacement, text)

    # Digits are blanked only after the rules above have used them
    # (e.g. "5k" -> "5000", " 9 11 " -> "911").
    text = replace_numbers.sub(' ', text)

    if count_null_words:
        tokens = text.split()
        for token in tokens:
            word_count_dict[token] += 1
        text = " ".join(tokens)

    if stem_words:
        # Imported lazily: only this opt-in path needs NLTK. Without the import
        # the name SnowballStemmer was undefined and this branch raised NameError.
        from nltk.stem import SnowballStemmer
        stemmer = SnowballStemmer('english')
        text = " ".join(stemmer.stem(word) for word in text.split())

    return text

In [6]:
print('Processing text dataset')

# Raw comment arrays for each split; missing comments become "no comment".
list_sentences_train, list_sentences_test, list_sentences_val = (
    frame["string"].fillna("no comment").values
    for frame in (train_x, test_x, val_x)
)

# Clean every comment with the default clean_text() options, processing the
# splits in train, test, val order.
train_comments, test_comments, val_comments = (
    list(map(clean_text, sentences))
    for sentences in (list_sentences_train, list_sentences_test, list_sentences_val)
)

print("Cleaned.")

Processing text dataset
Cleaned.


In [7]:
# Overwrite the raw text column of each split with its cleaned comments.
train_x['string'], test_x['string'], val_x['string'] = train_comments, test_comments, val_comments

# Destination files inside the output directory.
output_path_train = os.path.join(output_dir, 'cleaned_train_x.csv')
output_path_test = os.path.join(output_dir, 'cleaned_test_x.csv')
output_path_val = os.path.join(output_dir, 'cleaned_val_x.csv')

# Write each split without the index column, in train, test, val order.
for cleaned_frame, csv_path in (
    (train_x, output_path_train),
    (test_x, output_path_test),
    (val_x, output_path_val),
):
    cleaned_frame.to_csv(csv_path, index=False)