**1. Data analysis and cleaning**

Loading the data, removing noise from the text, and lemmatization

In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import spacy
from tqdm import tqdm

In [3]:
# Read the labelled comments, then preview the first rows, the frame's
# dimensions and the distribution of the `relevant` label.
df = pd.read_csv('data.csv')
display(df.head())
print(df.shape, df['relevant'].value_counts(), sep='\n')

Unnamed: 0,text,relevant
0,"–ï–¥–∏–Ω—Å—Ç–≤–µ–Ω–Ω—ã–π –≤ —Å—Ç—Ä–∞–Ω–µ, –Ω–∏—Å–∫–æ–ª—å–∫–æ –Ω–µ —Å–æ–º–Ω–µ–≤–∞—é—Å—å...",1
1,–ó–∞–º–µ—á–∞—Ç–µ–ª—å–Ω–∞—è –∂–µ–Ω—â–∏–Ω–∞ –∏ –ø—Ä–µ–ø–æ–¥–∞–≤–∞—Ç–µ–ª—å!üå∫,1
2,"–ü–µ—Ä–µ—Å–¥–∞—á–∞,–∫—É—Ä—Å–æ–≤–∞—è ,—É–¥–∞—á–∞ üòÅ–Ω–µ –≤—Å–µ —Ç–∞–∫ –ø–ª–æ—Ö–æ –≤...",1
3,–î–ª—è –ø–æ—Ç–µ–Ω—Ü–∏–∞–ª—å–Ω—ã—Ö –∞–±–∏—Ç—É—Ä–∏–µ–Ω—Ç–æ–≤ (—Å—Ç—É–¥–µ–Ω—Ç–æ–≤) –ø–æ–ª...,1
4,"–†–µ–π—Ç–∏–Ω–≥ –í—É–∑–∞ —ç—Ç–æ –∫–æ–Ω–µ—á–Ω–æ –∑–¥–æ—Ä–æ–≤–æ, –Ω–æ 5 –±—é–¥–∂–µ—Ç–Ω...",1


(43553, 2)
relevant
1    31177
0    12376
Name: count, dtype: int64


In [None]:
# Stop-words: kept as a set so membership tests during filtering are O(1)
russian_stopwords = set(stopwords.words("russian"))

# Lemmatizer: the cleaning step only reads `token.lemma_`, so the dependency
# parser and the NER component are disabled to avoid running them on every
# comment. NOTE(review): re-enable them here if other cells need
# `doc.ents`, `token.dep_` or sentence boundaries from this `nlp` object.
nlp = spacy.load("ru_core_news_sm", disable=["parser", "ner"])

# Regular expressions
RE_HTML = re.compile(r'<.*?>')  # non-greedy match of anything in <...>
# Code points above the Basic Multilingual Plane (U+10000..U+1FFFF)
RE_EMOJI = re.compile(r'[\U00010000-\U0001FFFF]', flags=re.UNICODE)

# Cleaning function
def clean_text(text):
    """Normalize a raw comment and return its lemmas joined by single spaces.

    The text is lower-cased, stripped of HTML tags, emoji, punctuation
    (including underscores) and digits, whitespace-normalized, lemmatized
    with the module-level ``nlp`` pipeline, and finally filtered against
    ``russian_stopwords``.

    Args:
        text: The raw comment. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as missing.

    Returns:
        A string of space-separated lemmas; ``''`` when the input is missing
        or nothing is left after cleaning.
    """
    # Missing values would otherwise be stringified into the bogus tokens
    # "none" / "nan" (NaN is the only float that is not equal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()  # Lowercase
    # Tags and emoji are replaced by a space, not removed outright, so that
    # the words on either side of them are not glued together.
    text = RE_HTML.sub(' ', text)  # HTML
    text = RE_EMOJI.sub(' ', text)  # Emoji
    # `\w` also matches "_", so the underscore has to be listed explicitly.
    text = re.sub(r'[^\w\s]|_', ' ', text)  # Punctuation marks
    text = re.sub(r'\d+', '', text)  # Numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Extra spaces
    if not text:
        return ''  # nothing left to lemmatize

    lemmas = (token.lemma_ for token in nlp(text))
    # Drop empty lemmas and stop-words.
    return ' '.join(
        lemma for lemma in lemmas
        if lemma and lemma not in russian_stopwords
    )


In [None]:
# Cleaning: register tqdm's `progress_apply` on pandas objects, run
# `clean_text` over every comment with a progress bar, and preview the result.
tqdm.pandas()
df['clean'] = df['text'].progress_apply(clean_text)
display(df.head())

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 43553/43553 [07:13<00:00, 100.44it/s]


Unnamed: 0,text,relevant,clean
0,"–ï–¥–∏–Ω—Å—Ç–≤–µ–Ω–Ω—ã–π –≤ —Å—Ç—Ä–∞–Ω–µ, –Ω–∏—Å–∫–æ–ª—å–∫–æ –Ω–µ —Å–æ–º–Ω–µ–≤–∞—é—Å—å...",1,–µ–¥–∏–Ω—Å—Ç–≤–µ–Ω–Ω—ã–π —Å—Ç—Ä–∞–Ω–∞ –Ω–∏—Å–∫–æ–ª—å–∫–æ —Å–æ–º–Ω–µ–≤–∞—Ç—å—Å—è —ç—Ç–æ ...
1,–ó–∞–º–µ—á–∞—Ç–µ–ª—å–Ω–∞—è –∂–µ–Ω—â–∏–Ω–∞ –∏ –ø—Ä–µ–ø–æ–¥–∞–≤–∞—Ç–µ–ª—å!üå∫,1,–∑–∞–º–µ—á–∞—Ç–µ–ª—å–Ω—ã–π –∂–µ–Ω—â–∏–Ω–∞ –ø—Ä–µ–ø–æ–¥–∞–≤–∞—Ç–µ–ª—å
2,"–ü–µ—Ä–µ—Å–¥–∞—á–∞,–∫—É—Ä—Å–æ–≤–∞—è ,—É–¥–∞—á–∞ üòÅ–Ω–µ –≤—Å–µ —Ç–∞–∫ –ø–ª–æ—Ö–æ –≤...",1,–ø–µ—Ä–µ—Å–¥–∞—á–∞ –∫—É—Ä—Å–æ–≤–æ–π —É–¥–∞—á–∞ –ø–ª–æ—Ö–æ–π –≤—Ä–æ–¥–µ
3,–î–ª—è –ø–æ—Ç–µ–Ω—Ü–∏–∞–ª—å–Ω—ã—Ö –∞–±–∏—Ç—É—Ä–∏–µ–Ω—Ç–æ–≤ (—Å—Ç—É–¥–µ–Ω—Ç–æ–≤) –ø–æ–ª...,1,–ø–æ—Ç–µ–Ω—Ü–∏–∞–ª—å–Ω—ã–π –∞–±–∏—Ç—É—Ä–∏–µ–Ω—Ç —Å—Ç—É–¥–µ–Ω—Ç –ø–æ–ª—É—á–∏—Ç—å –≤–µ—Å—å...
4,"–†–µ–π—Ç–∏–Ω–≥ –í—É–∑–∞ —ç—Ç–æ –∫–æ–Ω–µ—á–Ω–æ –∑–¥–æ—Ä–æ–≤–æ, –Ω–æ 5 –±—é–¥–∂–µ—Ç–Ω...",1,—Ä–µ–π—Ç–∏–Ω–≥ –≤—É–∑ —ç—Ç–æ –∑–¥–æ—Ä–æ–≤—ã–π –±—é–¥–∂–µ—Ç–Ω—ã–π –º–µ—Å—Ç–æ –ø–æ—Å—Ç—É...


In [None]:
# Check: number of missing values in the cleaned column.
# NOTE(review): `clean_text` always returns a string, so comments that were
# emptied by the cleaning show up as '' and are not counted here.
missing_mask = df['clean'].isna()
print(missing_mask.sum())

0


In [None]:
# Save the frame, including the new `clean` column, without the row index.
df.to_csv(path_or_buf="clean_comments.csv", index=False)