# Data Cleaning

Прежде чем приступать к моделированию, твиты необходимо почистить. Произведем базовую очистку такую как коррекция слов с ошибками, удаление пунктуации, удаление html тегов и эмодзи.

In [1]:
import re
import string
import numpy as np
import pandas as pd
from spellchecker import SpellChecker

**Загружаем данные**

In [2]:
train_data = pd.read_csv("data/train.csv", index_col="id")
test_data = pd.read_csv("data/test.csv", index_col="id")
print(f"Shape of train data: {train_data.shape}. Shape of test data: {test_data.shape}")
train_size = train_data.shape[0]

Shape of train data: (7613, 4). Shape of test data: (3263, 3)


In [3]:
# df = pd.concat([train_data, test_data], sort=False)
# df.shape

data = pd.concat([train_data.drop(['target'], axis=1), test_data])
y = train_data['target']

**Удаляем urls**

In [4]:
example="New competition launched :https://www.kaggle.com/c/nlp-getting-started"

def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

remove_URL(example)

'New competition launched :'

In [5]:
data['text'] = data['text'].apply(lambda x : remove_URL(x))

**Удаляем html теги**

In [6]:
example = """<div>
<h1>Real or Fake</h1>
<p>Kaggle </p>
<a href="https://www.kaggle.com/c/nlp-getting-started">getting started</a>
</div>"""

def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)
print(remove_html(example))


Real or Fake
Kaggle 
getting started



In [7]:
data['text']=data['text'].apply(lambda x : remove_html(x))

**Удаляем эмодзи**

In [8]:
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

remove_emoji("Omg another Earthquake 😔😔")

'Omg another Earthquake '

In [9]:
data['text']=data['text'].apply(lambda x: remove_emoji(x))

**Удаляем пунктуацию**

In [10]:
def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

example="I am a #king"
print(remove_punct(example))

I am a king


**Исправляем слова с ошибками**

In [11]:
# pip install pyspellchecker

spell = SpellChecker()
def correct_spellings(text):
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)
        
text = "corect me plese"
correct_spellings(text)

'correct me please'

In [12]:
#df['text'] = df['text'].apply(lambda x : correct_spellings(x))

In [13]:
cleaned_train_data = data.iloc[:train_size, :]
cleaned_test_data = data.iloc[train_size:, :]

cleaned_train_data['target'] = y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [14]:
cleaned_train_data.to_csv('data/cleaned_train_data.csv', header=True)
cleaned_test_data.to_csv('data/cleaned_test_data.csv', header=True)