In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [2]:
# Read the training and test comment tables from the shared data directory.
train_path = '../data/train.csv'
test_path = '../data/test.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Preview the first rows of the training table as the cell output.
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
# Word-level TF-IDF features over the raw comment text:
# unigrams only, English stop words removed, accents stripped, sublinear
# term-frequency scaling, vocabulary capped at 5000 terms.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=5000)

# Vectorize the data.
print("Creating the tfidf vector...\n")

# Learn the vocabulary and IDF weights from the training comments only and
# transform them in the same pass; the test comments reuse the fitted
# vocabulary so no information from the test set leaks into the features.
x_train = word_vectorizer.fit_transform(train_df['comment_text'])
x_test = word_vectorizer.transform(test_df['comment_text'])

# The matrices are deliberately kept sparse. Calling .toarray() on them would
# allocate roughly 6 GB per matrix (about 160k rows x 5000 float64 columns),
# and the scikit-learn estimators used downstream accept sparse input directly.
print(x_train.shape)
print(x_test.shape)

Creating the tfidf vector...

(159571, 5000)
(153164, 5000)


In [4]:
# char_vectorizer = TfidfVectorizer(
#     sublinear_tf=True,
#     analyzer='char',
#     stop_words='english',
#     ngram_range=(2, 6),
#     max_features=50000)
# char_vectorizer.fit(data)
# char_features = sentence_vectorizer.transform(data)

In [5]:
# X_train = hstack([word_features[:train.shape[0]], sentence_features[:train.shape[0]]])
# X_test = hstack([word_features[train.shape[0]:], sentence_features[train.shape[0]:]])

In [9]:
# Train one binary logistic-regression model per target column, report its
# 5-fold cross-validated ROC AUC, and store its test-set probabilities in the
# submission frame loaded from the sample submission file.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
result = pd.read_csv('../data/sample_submission.csv')
scores = []

for label in labels:
    y_train = train_df[label]
    classifier = LogisticRegression(solver='liblinear')

    # Estimate generalisation performance: mean ROC AUC over the 5 folds.
    fold_scores = cross_val_score(classifier, x_train, y_train, cv=5, scoring='roc_auc')
    cv_score = np.mean(fold_scores)
    scores.append(cv_score)
    print('cv score for class {} is {}'.format(label, cv_score))

    # Fit on every training row, then keep the second predict_proba column
    # (presumably the probability of label 1 -- confirm the targets are 0/1).
    classifier.fit(x_train, y_train)
    class_probabilities = classifier.predict_proba(x_test)
    result[label] = class_probabilities[:, 1]

# Average of the per-label mean AUCs.
print('total cv score is {}'.format(np.mean(scores)))

cv score for class toxic is 0.9638751469406277
cv score for class severe_toxic is 0.9853885935892832
cv score for class obscene is 0.9818291783039268
cv score for class threat is 0.9824534504878771
cv score for class insult is 0.9723392247198277
cv score for class identity_hate is 0.9725132056972849
total cv score is 0.9763997999564714


In [12]:
# Write the filled-in submission frame to disk, omitting the row index.
submission_path = '../data/ml_submission.csv'
result.to_csv(submission_path, index=False)