In [12]:
import json
import re
import requests
import emoji

import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from collections import Counter
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing  import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score
from IPython.core.display import HTML, display

  from IPython.core.display import HTML, display


In [3]:
# Load the labelled training set; missing titles are replaced by the
# placeholder string 'N' so that every title is a str downstream.
train_df = pd.read_csv("train.csv").assign(
    title=lambda frame: frame['title'].fillna('N')
)

train_df.head()

Unnamed: 0,ID,url,title,label
0,0,m.kp.md,"Экс-министр экономики Молдовы - главе МИДЭИ, ц...",0
1,1,www.kp.by,Эта песня стала известна многим телезрителям б...,0
2,2,fanserials.tv,Банши 4 сезон 2 серия Бремя красоты смотреть о...,0
3,3,colorbox.spb.ru,Не Беси Меня Картинки,0
4,4,tula-sport.ru,В Новомосковске сыграют следж-хоккеисты алекси...,0


In [105]:
# Load the unlabelled test set with the same missing-title placeholder ('N')
# that is used for the training set.
test_df = pd.read_csv("test.csv").assign(
    title=lambda frame: frame['title'].fillna('N')
)
test_df.head()

Unnamed: 0,ID,url,title
0,135309,www.kommersant.ru,Шестой кассационный суд в Самаре начнет работу...
1,135310,urexpert.online,"Что такое индексация алиментов, кем и в каких ..."
2,135311,imperimeha.ru,Женщинам | Империя Меха - Part 12
3,135312,national-porn.com,"Небритые, волосатые киски: Порно всех стран и ..."
4,135313,2gis.ru,67


In [106]:
from sklearn.metrics import f1_score

In [107]:
# Columns of train_df / test_df that are extracted as model inputs
# (index 0 -> url, index 1 -> title in the resulting arrays).
baseline_features = ['url', 'title']

In [170]:
from nltk.corpus import stopwords

# Feature matrices: column 0 is the url, column 1 is the title.
# copy=True guarantees a private, writable array, so the in-place
# tokenization below can neither write into DataFrame-owned memory nor be
# blocked by a read-only view.
X_train = train_df[baseline_features].to_numpy(copy=True)
X_test = test_df[baseline_features].to_numpy(copy=True)
y_train = train_df["label"].values

print(type(X_train))

# Kept as a list because the same object is later passed to the vectorizers'
# `stop_words` argument; the frozenset is only for fast membership tests.
stop_words = stopwords.words('russian')
stop_words.extend(stopwords.words('english'))
stop_words_lookup = frozenset(stop_words)


def tokenize_title(title):
    """Turn one raw title into a list of cleaned, lower-cased tokens.

    Steps: replace emoji with their Russian text names, split on whitespace,
    lower-case every token, then drop stop words and purely numeric tokens.
    Lower-casing happens *before* the stop-word check so capitalised stop
    words (e.g. at the start of a sentence) are removed as well.
    """
    demojized = emoji.demojize(title, language='ru')
    lowered_tokens = (token.lower() for token in demojized.split())
    return [
        token for token in lowered_tokens
        if token not in stop_words_lookup and not token.isdigit()
    ]


def tokenize_titles(rows):
    """Replace the title (column 1) of every row, in place, by its token list."""
    for row in rows:
        row[1] = tokenize_title(row[1])


# Train and test go through exactly the same preprocessing; previously the
# test titles skipped emoji conversion and digit filtering, which made the
# features seen at prediction time differ from those seen during training.
tokenize_titles(X_train)

print(X_train[1])

tokenize_titles(X_test)



<class 'numpy.ndarray'>
['www.kp.by'
 list(['эта', 'песня', 'стала', 'известна', 'многим', 'телезрителям', 'благодаря', 'сериалу', 'диверсант-2'])]


In [150]:

# Rows whose label equals 1.
checked_titles = X_train[y_train == 1]

# Frequency of title tokens in those rows, counted after stripping every
# character that is neither a word character nor whitespace.
words_counter = Counter(
    re.sub(r'[^\w\s]', '', token)
    for row in checked_titles
    for token in row[1]
)

# The 100 most frequent tokens, most common first.
words_title = [word for word, _count in words_counter.most_common(100)]
words_title[1:16]

['порно',
 'porn',
 'видео',
 'sex',
 'videos',
 'онлайн',
 'video',
 'hd',
 'смотреть',
 'xxx',
 'free',
 'секс',
 'бесплатно',
 'фото',
 'скачать']

## Модель на словах. CountVectorizer

In [171]:
# Seed shared by every stochastic estimator in the notebook.
SEED = 42


# Bag-of-words pipeline: unigram counts (stop words removed, terms kept only
# if they occur in at least 3 documents and in at most 80% of them) followed
# by a class-balanced logistic regression trained with SGD.
count_model = Pipeline([
    (
        'vectorizer',
        CountVectorizer(
            lowercase=True, ngram_range=(1, 1),
            stop_words=stop_words, min_df=3, max_df=0.8
        )
    ),
    # 'log_loss' is the current name of the logistic loss; the old alias
    # 'log' was deprecated in scikit-learn 1.1 and removed in 1.3.
    ('clf', SGDClassifier(random_state=SEED, loss='log_loss', class_weight='balanced'))
])

In [169]:
# First pipeline step, i.e. the CountVectorizer of count_model.
vectorizer = count_model.steps[0][1]

# The vocabulary only exists after fitting. When the notebook is run top to
# bottom this cell comes before the cell that fits count_model, so fit the
# pipeline here if that has not happened yet instead of failing with
# NotFittedError.
if not hasattr(vectorizer, 'vocabulary_'):
    count_model.fit([str(text) for text in X_train], y_train)

# Vocabulary terms, in feature-index order.
features = np.array(vectorizer.get_feature_names_out())

vectorizer

In [172]:
# Each row (url + title tokens) is stringified so the vectorizer receives text.
count_model.fit(list(map(str, X_train)), y_train)



## Дерево


In [232]:
# Unigram bag-of-words features fed to an entropy-based decision tree that is
# limited to depth 10.
tree_vectorizer = CountVectorizer(
    stop_words=stop_words,
    lowercase=True,
    ngram_range=(1, 1),
    min_df=3,
    max_df=0.8,
)
tree_clf = DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=SEED)

tree_model = Pipeline(steps=[
    ('vectorizer', tree_vectorizer),
    ('clf', tree_clf),
])

In [233]:
# Train the tree pipeline on the stringified training rows.
tree_model.fit(list(map(str, X_train)), y_train)

## Модель на n-gram'ах символов. TfidfVectorizer

In [234]:
def preprocessor(text):
    """Normalise raw text for the character n-gram vectorizer.

    The text is lower-cased, every character other than the lowercase
    Cyrillic letters а-я / ё and the symbols ``! : ) (`` is replaced by a
    space, and runs of spaces are collapsed into a single space.
    """
    lowered = text.lower()
    only_allowed = re.sub("[^а-яё!:)(]", ' ', lowered)
    collapsed = re.sub(' +', ' ', only_allowed)
    return collapsed

In [235]:
# Character n-gram pipeline: TF-IDF over 2- to 4-character n-grams of the text
# returned by `preprocessor` (which already lower-cases its input), followed
# by a class-balanced logistic regression trained with SGD.
char_tfidf_model = Pipeline([
    (
        'vectorizer',
        TfidfVectorizer(
            lowercase=True, ngram_range=(2, 4), analyzer='char',
            preprocessor=preprocessor, min_df=5, max_df=0.8
        )
    ),
    # 'log_loss' is the current name of the logistic loss; the old alias
    # 'log' was deprecated in scikit-learn 1.1 and removed in 1.3.
    ('clf', SGDClassifier(random_state=SEED, loss='log_loss', class_weight='balanced'))
])

In [236]:
# Train the character n-gram pipeline on the stringified training rows.
char_tfidf_model.fit(list(map(str, X_train)), y_train)



## Подбор параметров с помощью кросс-валидации

In [237]:
# Search grid for the pipeline steps named 'vectorizer' and 'clf':
# 2 x 3 x 3 = 18 parameter combinations.
parameters = dict(
    vectorizer__max_df=(0.5, 0.75),
    vectorizer__min_df=(3, 5, 7),
    clf__alpha=(0.0001, 0.001, 0.01),
)

In [238]:
# 3-fold cross-validated grid search over `parameters` for count_model,
# scored by F1 and run on all available cores.
grid_search = GridSearchCV(
    estimator=count_model,
    param_grid=parameters,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [239]:
# Run the search on the stringified training rows.
grid_search.fit(list(map(str, X_train)), y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits




In [240]:
# Pipeline exposed by the grid search as its best estimator.
cv_tuned_pipeline = grid_search.best_estimator_

In [173]:
# NOTE: every score in this cell is computed on the training data itself
# (X_train / y_train), so it measures fit quality, not generalisation.
# The three disabled lines below report the same metric for the other models.
# print(f'Кросс-валидация: {f1_score(y_train,cv_tuned_pipeline.predict([str(text) for text in X_train]))}')
# print(f'Модель на n-gram-ах cимволов: {f1_score(y_train, char_tfidf_model.predict([str(text) for text in X_train]))}')
# print(f"Дерево: {f1_score(y_train,tree_model.predict([str(text) for text in X_train]))}")
train_texts = list(map(str, X_train))
count_train_f1 = f1_score(y_train, count_model.predict(train_texts))
print(f'Модель на словах: {count_train_f1}')

Модель на словах: 0.9705201969274573


## Submit

In [242]:
# Use count_model: predict a label for every test row and pair it with the
# row's ID from test_df.
test_texts = [str(row) for row in X_test]
result_df = pd.DataFrame({
    'ID': test_df['ID'].values,
    'label': count_model.predict(test_texts),
})

# Write the submission file without the index column.
result_df[["ID", "label"]].to_csv("result.csv", index=False)


In [None]:
# Disabled alternative submission: same steps as the count_model submission,
# but predicting with cv_tuned_pipeline and writing to "result1.csv".
# result1_df = pd.DataFrame(columns=['ID','label'])

# result1_df["label"] = cv_tuned_pipeline.predict([str(text) for text in X_test])
# result1_df['ID'] =test_df['ID'].values

# result1_df[["ID", "label"]].to_csv("result1.csv", index=False)
