In [None]:
import pandas as pd
import html
import re
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score

In [None]:
!mkdir ./data
!wget https://www.dropbox.com/s/mp9d95ti5dfwxlt/train.csv?dl=0 -O ./data/train.csv
!wget https://www.dropbox.com/s/a47s1tofl8q6by4/test.csv?dl=0 -O ./data/test.csv

--2021-06-01 15:02:36--  https://www.dropbox.com/s/mp9d95ti5dfwxlt/train.csv?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.2.18, 2620:100:6017:18::a27d:212
Connecting to www.dropbox.com (www.dropbox.com)|162.125.2.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/mp9d95ti5dfwxlt/train.csv [following]
--2021-06-01 15:02:36--  https://www.dropbox.com/s/raw/mp9d95ti5dfwxlt/train.csv
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc3fb97f570e4ae63dbec6506793.dl.dropboxusercontent.com/cd/0/inline/BPk-GGHTVnVbb8tbbA3Yi57M7tp0b53OSXstW9gnrD8xJhrOZYpqac6G8NZLx5yvudcBrvAJYaj_e26vVFzBkl1Ze1j78MmsfNzW6CygxfoyNIpy-66xmOStAswVdgzsecguFsE7tqYej_Tmdl24FVkd/file# [following]
--2021-06-01 15:02:36--  https://uc3fb97f570e4ae63dbec6506793.dl.dropboxusercontent.com/cd/0/inline/BPk-GGHTVnVbb8tbbA3Yi57M7tp0b53OSXstW9gnrD8xJhrOZYpqac6G8NZLx5yvudcBrvAJYaj_e26vVFzBkl1Ze1j78M

In [None]:
# Read the downloaded train and test tables into DataFrames.
train, test = map(pd.read_csv, ('./data/train.csv', './data/test.csv'))

In [None]:
# Show five random rows that occur more than once in the training data
# (keep=False flags every copy, including the first occurrence).
train.loc[train.duplicated(keep=False)].sample(n=5)

Unnamed: 0,sentence,language
3024980,гэта што калі мы не яны як але ён каб дзякуй б...,ru
1249471,the and of it that you to apos we this is in t...,ta
108013,гэта што калі мы не яны як але ён каб дзякуй б...,ru
1483794,що це ми не як на вони та але ви до про він оп...,et
688275,гэта што калі мы не яны як але ён каб дзякуй б...,ru


In [None]:
# Remove duplicated rows from the data. keep=False marks every copy of a
# repeated row, so all occurrences are dropped rather than keeping one of them.
train_clear = train.loc[~train.duplicated(keep=False)]

# Features are taken from the first column, targets from the second.
X = train_clear.iloc[:, 0]
y = train_clear.iloc[:, 1]

In [None]:
# Strip HTML entities and unwanted characters before vectorization
def ngrams_prep(text):
  """Normalize raw text for the character n-gram vectorizer.

  HTML entities are decoded first and the result is lowercased afterwards.
  Entity names are case-sensitive and numeric references such as ``&#65;``
  decode to uppercase letters, so lowercasing before decoding would leave
  the output only partially lowercased. Every non-word character
  (whitespace, punctuation) and every digit is then removed.

  Args:
    text: Raw input string.

  Returns:
    The decoded, lowercased string containing only non-digit word characters.
  """
  return re.sub(r'[\W\d]', '', html.unescape(text).lower())

def words_prep(text):
  """Normalize raw text for the word-level vectorizer.

  HTML entities are decoded first and the result is lowercased afterwards.
  Entity names are case-sensitive and numeric references such as ``&#65;``
  decode to uppercase letters, so lowercasing before decoding would leave
  the output only partially lowercased. Digits are then removed; all other
  characters (including whitespace and punctuation) are kept.

  Args:
    text: Raw input string.

  Returns:
    The decoded, lowercased string with all digits removed.
  """
  return re.sub(r'\d', '', html.unescape(text).lower())

In [None]:
# Build the model on character unigrams/bigrams and on words, all TF-IDF weighted.
# Both vectorizers read the same 'sentence' column; the ColumnTransformer stacks
# their outputs side by side into one feature matrix for the linear classifier.
model = Pipeline([
    ('transform', ColumnTransformer([
        # Character uni- and bigrams; min_df=1000 drops n-grams that occur
        # in fewer than 1000 documents.
        ('n-grams',
         TfidfVectorizer(preprocessor=ngrams_prep, analyzer='char',
                         ngram_range=(1, 2), min_df=1000),
         'sentence'),
        # Word tokens; min_df=100 drops words that occur in fewer than 100 documents.
        ('words',
         TfidfVectorizer(preprocessor=words_prep, analyzer='word', min_df=100),
         'sentence'),
    ])),
    # Fixed seed: SGD shuffles the training data, so without random_state the
    # fitted model and the reported score would change from run to run.
    ('clf', SGDClassifier(alpha=1e-06, random_state=42)),
])

In [None]:
# Evaluate on a hold-out split: 40% of the cleaned data is set aside for scoring.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.4,
)

# The pipeline selects its input column by name, so each Series is wrapped
# in a single-column DataFrame before being passed in.
model.fit(pd.DataFrame(X_train), y_train)
predict = model.predict(pd.DataFrame(X_test))

# Balanced accuracy of the hold-out predictions (displayed as the cell output).
balanced_accuracy_score(y_true=y_test, y_pred=predict)

0.9018664733134661

In [None]:
# Final prediction: refit on all of the cleaned training data,
# then label the second column of the test table.
model.fit(pd.DataFrame(X), y)
submission = model.predict(pd.DataFrame(test.iloc[:, 1]))

# Put the first test column (presumably the row id - verify against test.csv)
# next to the predicted labels and write the submission file without the index.
submission_ = pd.concat(
    [test.iloc[:, 0], pd.Series(submission, name='language')],
    axis='columns',
)
submission_.to_csv('submission.csv', index=False)