In [5]:
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.pipeline import make_union
union = make_union(PCA(n_components=2), TruncatedSVD(n_components=2))

In [6]:
X = [[0., 1., 3], [2., 2., 5]]
Y = union.fit_transform(X)

In [7]:
Y

array([[ 1.5       ,  0.        ,  3.03954967,  0.87243213],
       [-1.5       ,  0.        ,  5.72586357, -0.46312679]])

In [34]:
import pandas as pd
import numpy as np
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_union
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [23]:
data_path = r'E:\kaggle\Toxic_Comment_Classification_Challenge'
test_path = os.path.join(data_path, r'test.csv')
train_path = os.path.join(data_path, r'train.csv')
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [24]:
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [25]:
def clean_text(s):
    s = re.sub(r'[^a-zA-Z\']',' ',s)
    s = s.lower()
    s = s.split(" ")
    return " ".join(s)
df_train['comment_text'] = df_train.comment_text.apply(clean_text)
df_test['comment_text'] = df_test.comment_text.apply(clean_text)

In [27]:
train_text = df_train['comment_text']
test_text = df_test['comment_text']
all_text = pd.concat((train_text, test_text), ignore_index=True)

In [32]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    max_features=30000,
    stop_words = 'english')
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 4),
    max_features=30000)
vectorizer = make_union(word_vectorizer, char_vectorizer)

vectorizer.fit(all_text)
train_features = vectorizer.transform(train_text)
test_features = vectorizer.transform(test_text)

In [33]:
train_features[:2]

<2x60000 sparse matrix of type '<class 'numpy.float64'>'
	with 911 stored elements in Compressed Sparse Row format>

In [36]:
scores = []
submission = pd.DataFrame.from_dict({'id': df_test['id']})
for class_name in class_names:
    train_target = df_train[class_name]
    classifier = LogisticRegression(solver='sag')

    cv_score = np.mean(cross_val_score(
        classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(train_features, train_target)
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))


CV score for class toxic is 0.9782396600246109
CV score for class severe_toxic is 0.9886757087297292
CV score for class obscene is 0.9908064187644962
CV score for class threat is 0.9890619415448509
CV score for class insult is 0.9829389870946611
CV score for class identity_hate is 0.9830692238892786
Total CV score is 0.9854653233412712


In [37]:
submission.to_csv(os.path.join(data_path,r'submission.csv'), index=False)