In [1]:
import numpy as np
import pandas as pd
from  nltk.corpus import stopwords
import nltk
from gensim.models.word2vec import Word2Vec
import string
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
%matplotlib inline

In [2]:
# Read the four competition CSV files (train, test, test labels, sample submission).
train_df, test_df, test_labels_df, sample_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './datasets/train.csv',
        './datasets/test.csv',
        './datasets/test_labels.csv',
        './datasets/sample_submission.csv',
    )
)

# NLTK's English stop words followed by every ASCII punctuation character.
stop_words = [*stopwords.words('english'), *string.punctuation]

In [3]:
# Length of each training comment, computed through the pandas string accessor.
lens = train_df['comment_text'].str.len()

In [4]:
# Mean, standard deviation and maximum of the comment lengths, shown as a tuple.
lens.mean(), lens.std(), lens.max()

(394.0732213246768, 590.7202819048923, 5000)

In [5]:
# lens.hist()

In [6]:
# The six target columns of the training frame.
label_cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Row-wise maximum over the labels: 1 when at least one label is set, 0 otherwise.
any_label = train_df[label_cols].max(axis=1)
# 'none' is the complement: 1 only for comments that carry no label at all.
train_df['none'] = 1 - any_label

In [7]:
# Summary statistics of the numeric columns (the label columns and the derived 'none' flag).
train_df.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805,0.898321
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342,0.302226
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# (rows, columns) of the training frame and of the test frame.
train_df.shape, test_df.shape

((159571, 9), (153164, 2))

In [9]:
# Column names, non-null counts and dtypes of the test frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153164 entries, 0 to 153163
Data columns (total 2 columns):
id              153164 non-null object
comment_text    153164 non-null object
dtypes: object(2)
memory usage: 2.3+ MB


In [10]:
COMMENT = 'comment_text'
# Replace missing comments with a placeholder so that later steps only see strings.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selected column: the in-place form operates on a
# temporary object under pandas Copy-on-Write and leaves the frame unchanged.
train_df[COMMENT] = train_df[COMMENT].fillna("unknown")
test_df[COMMENT] = test_df[COMMENT].fillna("unknown")

In [11]:
# model
import re, string

In [12]:
# Character class matching every ASCII punctuation mark plus a handful of
# typographic symbols. string.punctuation goes through re.escape so that the
# backslash, brackets, caret and hyphen are taken literally: unescaped, the
# backslash escaped the following ']' and was itself never matched, and the bare
# '[' triggered a "possible nested set" FutureWarning.
re_token = re.compile(f'[{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’]')

In [13]:
def tokensize(s):
    """Split a comment into tokens after stripping punctuation.

    Every character matched by ``re_token`` is removed and the remaining text
    is split on whitespace.

    Args:
        s: the text to tokenize.

    Returns:
        A list of token strings. A list (not a single string) is required
        because this function is used as a vectorizer ``tokenizer``; returning
        a string would make the vectorizer iterate over individual characters.
    """
    return re_token.sub('', s).split()

In [16]:
m, n = train_df.shape
# TF-IDF over 1- and 2-grams of the tokens produced by tokensize. Terms must
# appear in at least 3 documents and in at most 90% of them.
# The on/off options are passed as real booleans: newer scikit-learn releases
# validate parameter types and do not accept the integer 1 for them.
vec = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokensize, min_df=3, max_df=0.9,
                      strip_accents='unicode', use_idf=True, smooth_idf=True, sublinear_tf=True)

In [17]:
# Fit the vectorizer on the training comments and build their term-document matrix.
train_term_doc = vec.fit_transform(train_df['comment_text'])

In [19]:
# Transform the test comments with the vectorizer already fitted on the training set.
test_term_doc = vec.transform(test_df['comment_text'])

In [20]:
# Short aliases for the train / test feature matrices used by the model code below.
x, test_x = train_term_doc, test_term_doc

In [21]:
def pr(y_i, y):
    """Smoothed per-feature ratio for the rows of the global ``x`` labelled ``y_i``.

    Sums each feature column over the rows where ``y == y_i`` and divides by
    the number of such rows, adding 1 to both numerator and denominator.

    Args:
        y_i: the label value to select (called with 1 and 0 in this notebook).
        y: array of labels, one per row of ``x``.
    """
    selected = y == y_i  # rows whose label equals y_i
    feature_sums = x[selected].sum(0)
    return (feature_sums + 1) / (selected.sum() + 1)

In [23]:
def get_mdl(y):
    """Fit a logistic regression on ratio-scaled features for one label.

    Args:
        y: pandas Series of labels, one per row of the global feature
            matrix ``x``.

    Returns:
        A ``(model, r)`` tuple: the fitted LogisticRegression and the log ratio
        ``r = log(pr(1, y) / pr(0, y))``. The same ``r`` has to be multiplied
        into any matrix later passed to the model.
    """
    y = y.values
    r = np.log(pr(1, y) / pr(0, y))
    # The dual formulation is only implemented by the liblinear solver. The
    # solver is named explicitly because the library default ('lbfgs' since
    # scikit-learn 0.22) raises an error when combined with dual=True.
    model = LogisticRegression(C=4, dual=True, solver='liblinear')
    # Scale every feature column by its log ratio before fitting.
    x_nb = x.multiply(r)
    return model.fit(x_nb, y), r

In [24]:
# Zero-initialised prediction matrix: one row per test comment, one column per label.
preds = np.zeros((len(test_df), len(label_cols)))

In [26]:
# Train one model per label and write its positive-class probabilities into the
# matching column of preds.
for col_idx, label in enumerate(label_cols):
    print('fit', label)
    m, r = get_mdl(train_df[label])
    # The test features get the same ratio scaling that was applied at fit time.
    test_x_nb = test_x.multiply(r)
    preds[:, col_idx] = m.predict_proba(test_x_nb)[:, 1]

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


In [28]:
# Assemble the submission: the id column followed by one probability column per label.
# NOTE(review): ids come from sample_df while preds was computed on test_df;
# this assumes both files list the comments in the same order — confirm.
submid = sample_df["id"].to_frame(name='id')
pred_df = pd.DataFrame(preds, columns=label_cols)
submission = pd.concat([submid, pred_df], axis=1)
submission.to_csv('submission.csv', index=False)

In [363]:
# The 'toxic' label column of the training frame.
toxic = train_df['toxic']

In [None]:
# Rows of the training feature matrix whose 'toxic' label equals 1.
x[toxic == 1]