### Датасет для бинарной классификации текстов.

**https://www.kaggle.com/areeves87/rscience-popular-comment-removal**

In [1]:
import pandas as pd

# Load the training split and discard the 'Unnamed: 0' and 'X' columns
# (presumably row-index artefacts of the export -- verify against the CSV).
df_train = (
    pd.read_csv('reddit_train.csv', encoding = 'latin-1')
      .drop(columns = ['Unnamed: 0', 'X'])
)
df_train.head(10)

Unnamed: 0,BODY,REMOVED
0,Always be wary of news articles that cite unpu...,0
1,The problem I have with this is that the artic...,0
2,"This is indicative of a typical power law, and...",0
3,This doesn't make sense. Chess obviously trans...,0
4,1. I dispute that gene engineering is burdenso...,0
5,"Very misleading headline. Using CRISPR, it wa...",0
6,At least we are getting pretty smart at detect...,1
7,"Democracy is the worst kind of government, exc...",0
8,Journal Reference:- [cell.com] (http://dx.doi....,0
9,There has been some interesting research on ca...,0


In [2]:
df_train.groupby("REMOVED").count()

Unnamed: 0_level_0,BODY
REMOVED,Unnamed: 1_level_1
0,14479
1,6857


In [3]:
# Load the test split and discard the 'Unnamed: 0' and 'X' columns
# (presumably row-index artefacts of the export -- verify against the CSV).
df_test = (
    pd.read_csv('reddit_test.csv', encoding = 'latin-1')
      .drop(columns = ['Unnamed: 0', 'X'])
)
df_test.head()

Unnamed: 0,BODY,REMOVED
0,Is there any veracity to the claims that peopl...,0
1,"Welcome to the 1980's, Head of NASA",1
2,Hold on while I stroke my ego,1
3,I wonder if this is associated with majority o...,0
4,"So, just like everyone, everywhere, then?",1


In [4]:
df_test.groupby("REMOVED").count()

Unnamed: 0_level_0,BODY
REMOVED,Unnamed: 1_level_1
0,4812
1,2299


### Выводим в цикле целиком текст первых нескольких записей.

In [5]:
# Show the first 21 comments in full. Each text is wrapped in a list so that
# print() emits its repr, making escape sequences such as \r\n visible.
for body in df_train.BODY[:21]:
    print("______", "      ", [body], sep="\n")

______
      
["Always be wary of news articles that cite unpublished studies. Even if they are eventually published as claimed, it's not the responsible way to report on science.\r\n\r\nIt could be an absolutely shit study, but once the claims are made in public you can't unring the bell. At least if the study is reported on *when* it is published, one can challenge it at the same time. "]
______
      
['The problem I have with this is that the article appears to credit the plain packets with the decline in smoking. \r\n\r\n"Between 2010 and 2013, the proportion of daily smokers in Australia dropped from 15.1 to 12.8 per cent - a record decline." In other words, a drop of 2.3 percent (15.1 - 12.8). \r\n\r\nYet in the United States - which does *not* have plain packaging - the rate dropped from 19.3 (in 2010) to 17.8 (in 2013). That\'s 1.5 percent (19.3 - 17.8). Granted, that\'s not as much as Australia\'s decline but it indicates that you cannot credit Australia\'s entire decline to 

### Находим малоинформативные повторяющиеся участки в текстах

In [6]:
def clean_text(df, text_field):
    """Normalise the text column ``text_field`` of ``df`` in place.

    The column is run through an ordered list of regular-expression
    substitutions (URL / mention removal, punctuation stripping, replacement
    of numbers, ``!``, ``?`` and line breaks by word tokens, whitespace
    collapsing) and is finally lower-cased.

    Every substitution is applied with ``regex=True``. Without it, pandas
    versions whose ``Series.str.replace`` defaults to literal matching would
    treat the patterns as plain strings and leave the text essentially
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is modified in place.
    text_field : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Ordered (pattern, replacement) pairs -- the order is significant.
    substitutions = [
        (r"http\S+", ""),                              # drop URLs
        (r"http", ""),                                 # drop leftover bare "http"
        (r"@\S+", ""),                                 # drop @mentions
        (r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " "),        # keep only a small character whitelist
        (r"@", "at"),
        # NOTE(review): the three steps marked "no-op" below cannot match
        # anything, because the steps above already removed ':', '/', '.', '@'
        # and '$'. They are kept so the intended tokens stay visible; decide
        # whether they should run before the whitelist step or be deleted.
        (r"(http|https)://[^\s]*", " httpaddr "),          # no-op
        (r"\b[^\s]+@[^\s]+[.][^\s]+\b", " emailaddr "),    # no-op
        (r"\b[\d.]+\b", " number "),
        (r"[$]", " dollar "),                              # no-op
        (r"[!]", " exclammark "),
        (r"[?]", " questmark "),
        # Remove all other punctuation (replace with white space)
        (r"([^\w\s]+)|([_-]+)", " "),
        # Line breaks become tokens. The double newline has to be handled
        # before the single one, otherwise " blankline " can never be produced.
        (r"\n\n", " blankline "),
        (r"\n", " newline "),
        # Make all white space a single space
        (r"\s+", " "),
    ]

    text = df[text_field]
    for pattern, replacement in substitutions:
        text = text.str.replace(pattern, replacement, regex=True)

    df[text_field] = text.str.lower()
    return df

In [7]:
# clean_text overwrites the BODY column of the frame it receives and returns
# that same frame, so these assignments rebind the names to the cleaned objects.
df_train = clean_text(df_train, "BODY")
df_test = clean_text(df_test, "BODY")

In [8]:
df_train.head()

Unnamed: 0,BODY,REMOVED
0,always be wary of news articles that cite unpu...,0
1,the problem i have with this is that the artic...,0
2,this is indicative of a typical power law and ...,0
3,this doesn t make sense chess obviously transl...,0
4,number i dispute that gene engineering is bur...,0


In [9]:
df_test.head()

Unnamed: 0,BODY,REMOVED
0,is there any veracity to the claims that peopl...,0
1,welcome to the number s head of nasa,1
2,hold on while i stroke my ego,1
3,i wonder if this is associated with majority o...,0
4,so just like everyone everywhere then questmark,1


### Проходимся по текстам стеммером — приводим слова к их основам (стеммам)

In [10]:
import re
from nltk.tokenize import sent_tokenize, word_tokenize

# def stem(word):
#     regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
#     stem, suffix = re.findall(regexp, word)[0]
#     return stem

In [11]:
import Stemmer

stemobject = Stemmer.Stemmer('english')

def stemmer(x):
    """Stem each space-separated token of ``x`` and join the stems with single spaces."""
    return ' '.join(map(stemobject.stemWord, x.split(' ')))

### Создаём матрицу TfIdf с помощью TfidfVectorizer.

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
from stop_words import get_stop_words
en_stopwords = get_stop_words('en')

In [14]:
# Baseline vectoriser with library defaults only: no custom tokenizer,
# no stemming preprocessor and no stop-word list are passed.
tfidf = TfidfVectorizer()
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [15]:
%%time
X_train = tfidf.fit_transform(df_train['BODY'])
y_train = df_train['REMOVED']
X_test = tfidf.transform(df_test['BODY']) 
y_test = df_test['REMOVED']

Wall time: 1.62 s


### Применяем Logistic Regression к получившейся матрице tfidf, проводим классификацию (с помощью кросс-валидации), выводим получившиеся метрики качества (accuracy, classification_report).

In [16]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix 

In [17]:
%%time
# Fit a LogisticRegressionCV with default settings on the TF-IDF training
# matrix; the fitted estimator is reused by the evaluation cells that follow.
clf = LogisticRegressionCV()
clf.fit(X_train, y_train)

Wall time: 19 s


In [18]:
accuracy_score(y_test, clf.predict(X_test))

0.753339895935874

In [19]:
print(classification_report(y_test, clf.predict(X_test)))

             precision    recall  f1-score   support

          0       0.78      0.88      0.83      4812
          1       0.66      0.49      0.56      2299

avg / total       0.74      0.75      0.74      7111



In [20]:
confusion_matrix(y_test, clf.predict(X_test))

array([[4235,  577],
       [1177, 1122]], dtype=int64)

In [21]:
tn, fp, fn, tp = confusion_matrix(y_test, clf.predict(X_test)).ravel()
(tn, fp, fn, tp)

(4235, 577, 1177, 1122)

### Оптимизация и подбор наилучших гиперпараметров и способа предобработки текстов.

In [22]:
import warnings
# NOTE(review): this silences every warning from here on, including
# deprecation notices raised by the imports and estimators used below.
# Consider restricting the filter to a specific category or module.
warnings.filterwarnings('ignore')

In [23]:
# GridSearchCV lives in sklearn.model_selection; the legacy
# sklearn.grid_search module was deprecated and later removed from scikit-learn.
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

In [348]:
# LogisticRegression is not imported by any earlier cell (only
# LogisticRegressionCV is), so import it here to keep the cell runnable
# on a fresh kernel.
from sklearn.linear_model import LogisticRegression

# TF-IDF + logistic regression pipeline, searched without stemming.
pipeline = make_pipeline(TfidfVectorizer(stop_words="english"), 
                         LogisticRegression())
# Keys follow make_pipeline's automatic step names: <lowercased class name>__<parameter>.
params = {'logisticregression__C': (0.1, 1, 10),
          "tfidfvectorizer__ngram_range": ((1, 1), (1, 2)),
          "tfidfvectorizer__max_df": (0.5, 0.75, 1.0),
          "tfidfvectorizer__min_df":(1, 5, 10, 20, 50),
          "tfidfvectorizer__max_features":(None, 5000, 10000, 50000)}

grid = GridSearchCV(pipeline, param_grid=params, cv=5)

In [349]:
# LogisticRegression is not imported by any earlier cell (only
# LogisticRegressionCV is), so import it here to keep the cell runnable
# on a fresh kernel.
from sklearn.linear_model import LogisticRegression

# Same pipeline as the previous search, but the grid additionally toggles the
# stemming preprocessor and allows trigrams. Note that `pipeline` and `params`
# are rebound here; `grid` keeps the objects it was constructed with.
pipeline = make_pipeline(TfidfVectorizer(stop_words="english"), 
                         LogisticRegression())
params = {'logisticregression__C': (0.1, 1, 10),
          "tfidfvectorizer__preprocessor":(None, stemmer),        
          "tfidfvectorizer__ngram_range": ((1, 1), (1, 2), (1,3)),
          "tfidfvectorizer__max_df": (0.5, 0.75, 1.0),
          "tfidfvectorizer__min_df":(1, 5, 10, 20, 50),
          "tfidfvectorizer__max_features":(None, 5000, 10000, 50000)}

grid2 = GridSearchCV(pipeline, param_grid=params, cv=5)

**Выполняется около полутора часов (без стемминга)**
- Wall time: 1h 36min 37s

In [354]:
%%time
# grid.fit(df_train['BODY'], y_train)
print(grid.best_params_)

{'logisticregression__C': 10, 'tfidfvectorizer__max_df': 0.5, 'tfidfvectorizer__max_features': None, 'tfidfvectorizer__min_df': 1, 'tfidfvectorizer__ngram_range': (1, 2)}
Wall time: 1h 36min 37s


In [355]:
grid.score(df_test['BODY'], y_test)

0.7450428912951765

In [356]:
print(classification_report(y_test, grid.predict(df_test['BODY'])))

             precision    recall  f1-score   support

          0       0.77      0.90      0.83      4812
          1       0.66      0.43      0.52      2299

avg / total       0.73      0.75      0.73      7111



In [357]:
# Predict once and reuse the matrix: every grid.predict call re-vectorises
# the whole test set, so computing it twice doubled the work.
cm_grid = confusion_matrix(y_test, grid.predict(df_test['BODY']))
print(cm_grid)
tn, fp, fn, tp = cm_grid.ravel()
(tn, fp, fn, tp)

[[4318  494]
 [1319  980]]


(4318, 494, 1319, 980)

**Выполняется почти шесть часов (со стеммингом)**
- Wall time: 5h 47min 9s

In [254]:
%%time
# Wall time: 5h 47min 9s
### grid2.fit(df_train['BODY'], y_train) 
print(grid2.best_params_)

{'logisticregression__C': 10, 'tfidfvectorizer__max_df': 0.5, 'tfidfvectorizer__max_features': None, 'tfidfvectorizer__min_df': 1, 'tfidfvectorizer__ngram_range': (1, 2), 'tfidfvectorizer__preprocessor': <function stemmer at 0x000000001E7B8E18>}
Wall time: 5h 47min 9s


In [255]:
grid2.score(df_test['BODY'], y_test)

0.7446210097032766

In [256]:
print(classification_report(y_test, grid2.predict(df_test['BODY'])))

             precision    recall  f1-score   support

          0       0.77      0.89      0.83      4812
          1       0.66      0.43      0.52      2299

avg / total       0.73      0.74      0.73      7111



In [262]:
# Predict once and reuse the matrix: every grid2.predict call re-runs the
# whole preprocessing and vectorisation of the test set.
cm_grid2 = confusion_matrix(y_test, grid2.predict(df_test['BODY']))
print(cm_grid2)
tn, fp, fn, tp = cm_grid2.ravel()
(tn, fp, fn, tp)

[[4301  511]
 [1305  994]]


(4301, 511, 1305, 994)

### Параметры наилучшей модели и метрики классификации.

In [5]:
# Test-set accuracy per model, copied from the outputs recorded in this notebook.
# Row 0 is the run without stemming, row 1 the run with the stemming preprocessor.
# Combinations that were not evaluated are stored as missing values (None -> NaN)
# rather than the string 'none', so the metric columns stay numeric.
parametres_models = pd.DataFrame({'LogisticRegressionCV': [0.7533, None],
                                  'SGDClassifier': [0.7544, None],
                                  'GridSearchCV': [0.7450, 0.7446],
                                  'model': ['not stemmer', 'with stemmer']})
parametres_models

Unnamed: 0,GridSearchCV,LogisticRegressionCV,SGDClassifier,model
0,0.745,0.7533,0.7544,not stemmer
1,0.7446,none,none,with stemmer


### SGDClassifier

In [362]:
%%time
from sklearn.linear_model import  SGDClassifier
sgd_logit = SGDClassifier(
#     max_iter=(10 ** 6 / X_train.shape[0]), 
                          random_state=17, 
                          n_jobs=-1)


sgd_logit.fit(X_train, y_train)

print(accuracy_score(y_test, sgd_logit.predict(X_test)))

print(classification_report(y_test, sgd_logit.predict(X_test)))

print(confusion_matrix(y_test, sgd_logit.predict(X_test)))
tn, fp, fn, tp = confusion_matrix(y_test, sgd_logit.predict(X_test)).ravel()
print('TN', tn)
print('FP', fp)
print('FN', fn)
print('TP', tp)

0.7544649135142737
             precision    recall  f1-score   support

          0       0.77      0.91      0.83      4812
          1       0.69      0.43      0.53      2299

avg / total       0.75      0.75      0.74      7111

[[4378  434]
 [1312  987]]
TN 4378
FP 434
FN 1312
TP 987
Wall time: 73 ms
