In [3]:
from google.colab import files
from IPython.display import clear_output

files.upload()
clear_output()

In [4]:
!pip install -U blackcellmagic
clear_output()

In [5]:
%load_ext blackcellmagic

In [6]:
!pip install -U -q kaggle
!mkdir -p ~/.kaggle

In [7]:
!chmod 600 kaggle.json
!cp kaggle.json ~/.kaggle/

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [9]:
!kaggle datasets download -d blackmoon/russian-language-toxic-comments
!kaggle datasets download -d alexandersemiletov/toxic-russian-comments

Downloading russian-language-toxic-comments.zip to /content
  0% 0.00/1.49M [00:00<?, ?B/s]
100% 1.49M/1.49M [00:00<00:00, 108MB/s]
Downloading toxic-russian-comments.zip to /content
 42% 5.00M/12.0M [00:00<00:00, 34.3MB/s]
100% 12.0M/12.0M [00:00<00:00, 58.8MB/s]


In [10]:
!unzip russian-language-toxic-comments.zip
!rm -rf russian-language-toxic-comments.zip
!unzip toxic-russian-comments.zip
!rm -rf toxic-russian-comments.zip

Archive:  russian-language-toxic-comments.zip
  inflating: labeled.csv             
Archive:  toxic-russian-comments.zip
  inflating: dataset.txt             


In [11]:
df_list = []

with open("dataset.txt") as f:
    for line in f:
        label, text = line.split()[0], ' '.join(line.split()[1:])
        if label == "__label__NORMAL":
            mask = 0.0
        else:
            mask = 1.0
        df_list.append((text, mask))

In [12]:
df_1 = pd.read_csv("labeled.csv")
df_2 = pd.DataFrame(df_list, columns=["comment", "toxic"])

df = pd.concat([df_1, df_2])

In [13]:
df.head()

Unnamed: 0,comment,toxic
0,"–í–µ—Ä–±–ª—é–¥–æ–≤-—Ç–æ –∑–∞ —á—Ç–æ? –î–µ–±–∏–ª—ã, –±–ª...\n",1.0
1,"–•–æ—Ö–ª—ã, —ç—Ç–æ –æ—Ç–¥—É—à–∏–Ω–∞ –∑–∞—Ç—é–∫–∞–Ω–æ–≥–æ —Ä–æ—Å—Å–∏—è–Ω–∏–Ω–∞, –º–æ–ª...",1.0
2,–°–æ–±–∞–∫–µ - —Å–æ–±–∞—á—å—è —Å–º–µ—Ä—Ç—å\n,1.0
3,"–°—Ç—Ä–∞–Ω–∏—Ü—É –æ–±–Ω–æ–≤–∏, –¥–µ–±–∏–ª. –≠—Ç–æ —Ç–æ–∂–µ –Ω–µ –æ—Å–∫–æ—Ä–±–ª–µ–Ω–∏...",1.0
4,"—Ç–µ–±—è –Ω–µ —É–±–µ–¥–∏–ª 6-—Å—Ç—Ä–∞–Ω–∏—á–Ω—ã–π –ø–¥—Ñ –≤ —Ç–æ–º, —á—Ç–æ –°–∫—Ä...",1.0


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 262702 entries, 0 to 248289
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   comment  262702 non-null  object 
 1   toxic    262702 non-null  float64
dtypes: float64(1), object(1)
memory usage: 6.0+ MB


In [15]:
df["toxic"].value_counts()

0.0    213271
1.0     49431
Name: toxic, dtype: int64

In [16]:
df.to_csv("toxic_comments.csv")

In [15]:
files.download("toxic_comments.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [17]:
df_toxic = df[df["toxic"] == 1.0]
df_nontoxic = df[df["toxic"] == 0.0]

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    df_toxic["comment"], df_toxic["toxic"], random_state=42
    )

In [19]:
df_train = pd.concat(
    [pd.DataFrame.from_dict({"comment": X_train, "toxic": y_train}),
     df_nontoxic]
    ).sample(frac=1).reset_index(drop=True)

In [20]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250344 entries, 0 to 250343
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   comment  250344 non-null  object 
 1   toxic    250344 non-null  float64
dtypes: float64(1), object(1)
memory usage: 3.8+ MB


In [21]:
df_train.head()

Unnamed: 0,comment,toxic
0,600 –ª–µ–π –Ω–∞ —ç—Ç–æ,0.0
1,–º–æ–ª–æ–¥—Ü—ã —Ç—Ä—É–¥—è–≥–∏,0.0
2,10 –∫–æ–ø–µ–µ–∫ —Å—Ç–æ–∏–ª–∞ –∫–∞—Ä—Ç–æ—à–∫–∞ –≤ —Å—Ç–æ–ª–æ–≤–æ–π! –∞ –æ–≤–æ—â–∏ ...,0.0
3,–±–ª—è–¥—å –ø—Ä–∏–¥—É–º–∞–ª –æ–Ω!,1.0
4,"–∞ —Å–∫–æ–ª—å–∫–æ —Å—Ç–æ–∏—Ç –∫–≤ –º–µ—Ç—Ä, —Ç–∞–∫ –∂–µ —Ä–∞—Å—Å—á–∏—Ç—ã–≤–∞—é—Ç —Å...",0.0


In [22]:
df_train["toxic"].value_counts()

0.0    213271
1.0     37073
Name: toxic, dtype: int64

In [23]:
df_train.to_csv("train.csv")

In [24]:
len(X_test)

12358

In [31]:
with open('test.txt', 'w') as test:
    for comment in X_test:
        test.write(comment.strip('\n') + '\n')

In [32]:
!head test.txt

–≤–æ—Ç —ç—Ç–æ —Ç–æ—á—å–Ω–æ –ø–∏–¥–æ—Ä–∞—Å—ã
—Å–∫–∞–ª—å–ø—ã –º–µ—Å—Ç–Ω–æ–≥–æ –Ω–∞—Å–µ–ª–µ–Ω–∏—è –∑–∞ –¥–µ–Ω—å–≥–∏ –Ω–µ –ø—Ä–∏–Ω–∏–º–∞–ª–∏, –Ω–µ —Ä–µ–∑–∞–ª–∏ —Ä—É–∫–∏ –¥–µ—Ç—è–º –∑–∞ –æ—Ç–∫–∞–∑ –∏—Ö —Ä–æ–¥–∏—Ç–µ–ª–µ–π —Ä–∞–±–æ—Ç–∞—Ç—å –∏ –Ω–µ–≤—ã–ø–æ–ª–Ω–µ–Ω–∏–µ –ø–ª–∞–Ω–∞, –ö–∞–∫–æ–π –ø–æ—Ç—Ä—è—Å–∞—é—â–µ —É–¥–æ–±–Ω—ã–π –ø–æ–¥—Ö–æ–¥. –ê –µ—â–µ –Ω–µ –Ω–∞–±–∏–≤–∞–ª–∏ —Å–æ–ª–æ–º–æ–π —Ç—Ä—É–ø—ã –≤—Ä–∞–≥–æ–≤ –∏ –Ω–µ –Ω–∞—Å–∏–ª–æ–≤–∞–ª–∏ –ø—Ä–µ–¥–º–µ—Ç—ã –¥–æ–º–∞—à–Ω–µ–≥–æ –æ–±–∏—Ö–æ–¥–∞ —Ç—É–∑–µ–º—Ü–µ–≤. –ß—Ç–æ –Ω–µ –¥–æ–±–∞–≤–∏–ª–∏ –∏ —ç—Ç–∏ –¥–≤–∞ –ø—É–Ω–∫—Ç–∞? –Ø –ø—Ä–∏–≤–µ–ª —Ç–µ–±–µ —Ñ–∞–∫—Ç—ã –º–Ω–æ–≥–æ—á–∏—Å–ª–µ–Ω–Ω—ã—Ö —É–±–∏–π—Å—Ç–≤ –∞–±–æ—Ä–∏–≥–µ–Ω–æ–≤ —Ä–∞–¥–∏ –∑–∞—Ö–≤–∞—Ç–∞ –∏—Ö –∑–µ–º–µ–ª—å –∏–ª–∏ –ø–æ–¥—á–∏–Ω–µ–Ω–∏—è –∏—Ö –Ω–∞—Ä–æ–¥–æ–≤, –∞ —Ç—ã –ª–æ–≤–∫–æ –≤—ã–∫—Ä—É—á–∏–≤–∞–µ—à—å—Å—è —Ç–µ–º, —á—Ç–æ –≤—ã—Ä–µ–∑–∞–ª–∏-—Ç–æ –Ω–µ –≤—Å–µ—Ö –ø–æ–≥–æ–ª–æ–≤–Ω–æ, –∞ –≤—Å–µ–≥–æ –ª–∏—à—å –Ω–µ–ø–æ–∫–æ—Ä–Ω—ã–µ –∞—É–ª—ã, –∫–æ–Ω—Ü–ª–∞–≥–µ—Ä–∏ —Å—Ç—Ä–æ–∏–ª–∏ –Ω–µ –¥–ª—è –∫–æ–Ω–∫—Ä–µ—Ç–Ω—ã—Ö –Ω–

In [34]:
files.download("train.csv")
files.download("test.txt")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>