In [1]:
pip install flask

Note: you may need to restart the kernel to use updated packages.


In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from scipy.sparse import hstack
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Label columns that are later collapsed into the single binary target.
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Missing cells are replaced by a single space. The shuffle is seeded so that
# the row order, and every file derived from it, is identical on each run.
train = pd.read_csv('train.csv').fillna(' ').sample(frac=1, random_state=0)
train.head(3)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
98021,0c5b7bb471726b09,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
37854,650fb4a07c51f75a,RSV template \n\nI've created a similar templa...,0,0,0,0,0,0
75942,cb35229f646a1c77,"""\n\nNo, the statement is accurate. It's a ver...",0,0,0,0,0,0


In [2]:
# Binary target: the row-wise maximum over all label columns.
any_label = train[class_names].max(axis=1)
train['y'] = any_label.values
train['y'].value_counts()

0    143346
1     16225
Name: y, dtype: int64

In [3]:
# Split the data into train/test: 33% of the rows are held out, with a fixed seed.
features = train.drop(columns='y')
X_train, X_test, y_train, y_test = train_test_split(
    features, train['y'], test_size=0.33, random_state=0)

# Write every partition to CSV without the index column (test first, then train).
partitions = [
    (X_test, "X_test.csv"),
    (y_test, "y_test.csv"),
    (X_train, "X_train.csv"),
    (y_train, "y_train.csv"),
]
for frame, path in partitions:
    frame.to_csv(path, index=None)

In [4]:
# Class balance of the training partition, as fractions of its rows.
y_train.value_counts(normalize=True)

0    0.898084
1    0.101916
Name: y, dtype: float64

In [5]:
# Class balance of the test partition, as fractions of its rows.
y_test.value_counts(normalize=True)

0    0.898802
1    0.101198
Name: y, dtype: float64

In [6]:
%%time
# The pipeline is simple, but it needs a small transformer that picks the
# required field out of the input frame.
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that returns ``X[column]`` and learns nothing."""

    def __init__(self, column):
        # Stored under the same name as the constructor argument.
        self.column = column

    def fit(self, X, y=None):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return the configured column of ``X``."""
        selected = X[self.column]
        return selected

# Pipeline stages: select the raw text column, vectorise it with TF-IDF,
# then fit a logistic regression on the resulting features.
text_selector = FeatureSelector(column='comment_text')
tfidf = TfidfVectorizer(
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    max_features=10000,
)
log_reg = LogisticRegression(C=0.1)

classifier = Pipeline([
    ('comment_text', text_selector),
    ('comment_text_tfidf', tfidf),
    ('clf', log_reg),
])

# Run 3-fold cross-validation on the training partition, scored by ROC AUC.
cv_scores = cross_val_score(classifier, X_train, y_train, scoring='roc_auc', cv=3)
cv_score = cv_scores.mean()
print(f'CV score is {cv_score}')

# Fit the pipeline on the whole training partition, then take the second
# predict_proba column for the held-out rows.
classifier.fit(X_train, y_train)
test_proba = classifier.predict_proba(X_test)
y_score = test_proba[:, 1]

CV score is 0.9530465792068386
CPU times: user 38.9 s, sys: 1.24 s, total: 40.1 s
Wall time: 40.2 s


In [7]:
# Show the (name, estimator) pairs that make up the pipeline.
classifier.steps

[('comment_text', FeatureSelector(column='comment_text')),
 ('comment_text_tfidf',
  TfidfVectorizer(max_features=10000, stop_words='english',
                  strip_accents='unicode', token_pattern='\\w{1,}')),
 ('clf', LogisticRegression(C=0.1))]

In [8]:
import dill
# Serialise the fitted pipeline to disk with dill.
with open("logreg_pipeline.dill", "wb") as out_strm:
    dill.dump(classifier, out_strm)

In [9]:
# Read the test partition back from the CSV files written earlier.
# Note: read_csv returns a DataFrame, so y_test is no longer a Series here.
X_test = pd.read_csv("X_test.csv")
y_test = pd.read_csv("y_test.csv")

In [10]:
# Preview the first three rows of the reloaded test features.
X_test.head(3)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,f59e5c2d5c62354d,"""\nI have moved the image of Piccolo Daimao to...",0,0,0,0,0,0
1,7e5f6eafd2cd3147,"Template:Geobox coor \n\nHi, I had to do an em...",0,0,0,0,0,0
2,6aabe44bc87751c8,"""\nI was happy to do it. Thank you! Triple M """,0,0,0,0,0,0


In [11]:
# Restore the pipeline from disk.
# NOTE: dill, like pickle, can run arbitrary code while loading; only open
# files that come from a trusted source.
with open('logreg_pipeline.dill', 'rb') as dill_file:
    pipeline = dill.load(dill_file)

In [12]:
# Display the restored pipeline to confirm its steps.
pipeline

Pipeline(steps=[('comment_text', FeatureSelector(column='comment_text')),
                ('comment_text_tfidf',
                 TfidfVectorizer(max_features=10000, stop_words='english',
                                 strip_accents='unicode',
                                 token_pattern='\\w{1,}')),
                ('clf', LogisticRegression(C=0.1))])

In [13]:
# Score the test rows with the restored pipeline; keep the second
# predict_proba column.
test_proba = pipeline.predict_proba(X_test)
preds = test_proba[:, 1]

# Store the scores as a one-column CSV without the index.
pred_df = pd.DataFrame({'preds': preds})
pred_df.to_csv("test_predictions.csv", index=None)

In [14]:
# Peek at the first ten predicted scores.
preds[:10]

array([0.04480144, 0.07680503, 0.04580728, 0.07648689, 0.06655328,
       0.09042557, 0.03165131, 0.03694045, 0.03618859, 0.05824623])

In [15]:
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import f1_score
# Precision and recall at every candidate decision threshold.
precision, recall, thresholds = precision_recall_curve(y_test, preds)

# F-score per threshold. Points where precision + recall == 0 are scored 0
# rather than NaN, because np.argmax would otherwise return the NaN position.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum,
                   out=np.zeros_like(pr_sum), where=pr_sum > 0)
# precision/recall end with one extra point (precision=1, recall=0) that has no
# entry in `thresholds`; leave it out so that `thresholds[ix]` is always valid.
ix = np.argmax(fscore[:-1])
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.1808601467657604, F-Score=0.746, Precision=0.768, Recall=0.725


In [17]:
from flask import Flask, request, jsonify
import pandas as pd

In [18]:
# Load the trained model that the Flask handlers will use.
with open('logreg_pipeline.dill', 'rb') as in_strm:
    model = dill.load(in_strm)

In [19]:
# Read the saved test partition from disk again.
X_test = pd.read_csv("X_test.csv")
y_test = pd.read_csv("y_test.csv")

In [None]:
# Flask application object; the route handlers below are registered on it.
app = Flask(__name__)


@app.route("/", methods=["GET"])
def general():
    """Root route: answer GET requests with a fixed greeting."""
    greeting = "Welcome to prediction process"
    return greeting


@app.route('/predict', methods=['POST'])
def predict():
    """Score one comment with the loaded pipeline and return the result as JSON.

    The request body must be a JSON object with a ``comment_text`` field.
    On success the response carries ``success`` (True), ``predictions`` (the
    second ``predict_proba`` column for the comment) and the echoed
    ``comment_text``. If the body is not a JSON object, the field is missing,
    or its value is a non-empty non-string, the response is HTTP 400 with
    ``success`` False and an ``error`` message.
    """
    data = {"success": False}

    # silent=True returns None for a missing or malformed JSON body instead of
    # raising, so the failure can be reported in the response payload.
    request_json = request.get_json(silent=True)
    if not isinstance(request_json, dict) or 'comment_text' not in request_json:
        data["error"] = "Request body must be a JSON object with a 'comment_text' field"
        return jsonify(data), 400

    # A falsy value is scored as the empty string, as before.
    comment_text = request_json['comment_text'] or ""
    if not isinstance(comment_text, str):
        data["error"] = "'comment_text' must be a string"
        return jsonify(data), 400

    print(comment_text)
    preds = model.predict_proba(pd.DataFrame({"comment_text": [comment_text]}))
    # Cast the NumPy scalar to a built-in float before JSON serialisation.
    data["predictions"] = float(preds[:, 1][0])
    data["comment_text"] = comment_text
    # indicate that the request was a success
    data["success"] = True
    print('OK')

    # return the data dictionary as a JSON response
    return jsonify(data)


# Start Flask's built-in server when this code runs as the main module.
# app.run() blocks until the server is stopped.
if __name__ == '__main__':
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [04/Aug/2022 17:20:31] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [04/Aug/2022 17:20:31] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
