In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.multiclass import OneVsRestClassifier
from bs4 import BeautifulSoup
import re
stop_words = stopwords.words('english')

Using TensorFlow backend.


In [2]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
sample = pd.read_csv('../input/sample_submission.csv')

In [3]:
def clean_comment(row):        
    input=row['comment_text']
    cleaned_html=BeautifulSoup(input).get_text()
    cleaned_special_chr=re.sub('[^a-zA-Z]',' ',cleaned_html)
    cleaned_special_chr = re.sub(r'[?|$|.|!]',r'',cleaned_special_chr)
    lower=cleaned_special_chr.lower().split()
    clean_stop_word=[each_char for each_char  in lower if each_char not in set(stopwords.words("english"))]
    return ' '.join(clean_stop_word)
print("pre train")
train['comment_text'] = train.apply(clean_comment,axis = 1)
print("pre test")
test['comment_text'] = test.apply(clean_comment,axis=1)

pre train




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


pre test


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [4]:
xtrain = train.comment_text.values
xtest = test.comment_text.values
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(train.comment_text.values)

In [None]:
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(xtrain) + list(xtest))
print("fitting training data")
xtrain_tfv =  tfv.transform(xtrain) 
print("fitting testing data")
xtest_tfv = tfv.transform(xtest)

fitting training data
fitting testing data


In [None]:
predictions = OneVsRestClassifier(LogisticRegression(C=1.0)).fit(xtrain_tfv, y).predict_proba(xtest_tfv)

In [None]:
ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')

# Fitting Count Vectorizer to both training and test sets (semi-supervised learning)
ctv.fit(list(xtrain) + list(xtest))
xtrain_ctv =  ctv.transform(xtrain) 
xtest_ctv = ctv.transform(xtest)

In [None]:
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_ctv, y)
predictions1 = clf.predict_proba(xtest_ctv)

In [10]:
embeddings_index = {}
f = open('../input/glove.840B.300d.txt',encoding='utf-8')
for line in tqdm(f):
    values = line.split(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()


2196017it [03:02, 12065.20it/s]


In [13]:
def sent2vec(s):
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

In [14]:
xtrain_glove = [sent2vec(x) for x in tqdm(xtrain)]
xtest_glove = [sent2vec(x) for x in tqdm(xtest)]


  0%|          | 0/159571 [00:00<?, ?it/s][A
  0%|          | 7/159571 [00:00<38:12, 69.62it/s][A
  0%|          | 197/159571 [00:00<02:42, 981.95it/s][A
  0%|          | 425/159571 [00:00<01:52, 1409.71it/s][A
  0%|          | 619/159571 [00:00<01:43, 1541.71it/s][A
  1%|          | 801/159571 [00:00<01:39, 1596.94it/s][A
  1%|          | 1019/159571 [00:00<01:33, 1693.60it/s][A
  1%|          | 1233/159571 [00:00<01:30, 1756.26it/s][A
  1%|          | 1432/159571 [00:00<01:28, 1784.66it/s][A
  1%|          | 1664/159571 [00:00<01:25, 1844.04it/s][A
  1%|          | 1865/159571 [00:01<01:24, 1855.63it/s][A
  1%|▏         | 2065/159571 [00:01<01:25, 1847.80it/s][A
  1%|▏         | 2309/159571 [00:01<01:22, 1896.02it/s][A
  2%|▏         | 2517/159571 [00:01<01:22, 1893.70it/s][A
  2%|▏         | 2732/159571 [00:01<01:22, 1911.33it/s][A
  2%|▏         | 2938/159571 [00:01<01:21, 1910.21it/s][A
  2%|▏         | 3158/159571 [00:01<01:21, 1927.70it/s][A
  2%|▏         | 33

 17%|█▋        | 27692/159571 [00:14<01:10, 1882.40it/s][A
 17%|█▋        | 27891/159571 [00:14<01:09, 1882.11it/s][A
 18%|█▊        | 28109/159571 [00:14<01:09, 1883.85it/s][A
 18%|█▊        | 28309/159571 [00:15<01:09, 1883.67it/s][A
 18%|█▊        | 28516/159571 [00:15<01:09, 1884.89it/s][A
 18%|█▊        | 28730/159571 [00:15<01:09, 1886.39it/s][A
 18%|█▊        | 28961/159571 [00:15<01:09, 1889.05it/s][A
 18%|█▊        | 29174/159571 [00:15<01:08, 1890.51it/s][A
 18%|█▊        | 29410/159571 [00:15<01:08, 1893.51it/s][A
 19%|█▊        | 29638/159571 [00:15<01:08, 1895.87it/s][A
 19%|█▊        | 29859/159571 [00:15<01:08, 1896.72it/s][A
 19%|█▉        | 30074/159571 [00:15<01:08, 1898.18it/s][A
 19%|█▉        | 30289/159571 [00:15<01:08, 1899.28it/s][A
 19%|█▉        | 30519/159571 [00:16<01:07, 1901.73it/s][A
 19%|█▉        | 30737/159571 [00:16<01:07, 1901.62it/s][A
 19%|█▉        | 30948/159571 [00:16<01:07, 1902.89it/s][A
 20%|█▉        | 31158/159571 [00:16<01:

 34%|███▍      | 54843/159571 [00:29<00:56, 1838.68it/s][A
 34%|███▍      | 55051/159571 [00:29<00:56, 1839.29it/s][A
 35%|███▍      | 55261/159571 [00:30<00:56, 1840.03it/s][A
 35%|███▍      | 55470/159571 [00:30<00:56, 1840.85it/s][A
 35%|███▍      | 55669/159571 [00:30<00:56, 1840.59it/s][A
 35%|███▌      | 55861/159571 [00:30<00:56, 1840.82it/s][A
 35%|███▌      | 56053/159571 [00:30<00:56, 1840.39it/s][A
 35%|███▌      | 56251/159571 [00:30<00:56, 1840.83it/s][A
 35%|███▌      | 56442/159571 [00:30<00:56, 1840.98it/s][A
 35%|███▌      | 56638/159571 [00:30<00:55, 1841.33it/s][A
 36%|███▌      | 56835/159571 [00:30<00:55, 1841.54it/s][A
 36%|███▌      | 57026/159571 [00:30<00:55, 1841.46it/s][A
 36%|███▌      | 57220/159571 [00:31<00:55, 1841.72it/s][A
 36%|███▌      | 57432/159571 [00:31<00:55, 1842.60it/s][A
 36%|███▌      | 57628/159571 [00:31<00:55, 1842.81it/s][A
 36%|███▌      | 57823/159571 [00:31<00:55, 1843.01it/s][A
 36%|███▋      | 58017/159571 [00:31<00:

 53%|█████▎    | 84602/159571 [00:43<00:38, 1923.16it/s][A
 53%|█████▎    | 84826/159571 [00:44<00:38, 1923.63it/s][A
 53%|█████▎    | 85093/159571 [00:44<00:38, 1925.31it/s][A
 53%|█████▎    | 85328/159571 [00:44<00:38, 1926.25it/s][A
 54%|█████▎    | 85564/159571 [00:44<00:38, 1927.19it/s][A
 54%|█████▍    | 85799/159571 [00:44<00:38, 1927.99it/s][A
 54%|█████▍    | 86032/159571 [00:44<00:38, 1928.47it/s][A
 54%|█████▍    | 86276/159571 [00:44<00:37, 1929.61it/s][A
 54%|█████▍    | 86519/159571 [00:44<00:37, 1930.65it/s][A
 54%|█████▍    | 86764/159571 [00:44<00:37, 1931.81it/s][A
 55%|█████▍    | 87002/159571 [00:45<00:37, 1932.21it/s][A
 55%|█████▍    | 87241/159571 [00:45<00:37, 1933.16it/s][A
 55%|█████▍    | 87489/159571 [00:45<00:37, 1934.36it/s][A
 55%|█████▍    | 87725/159571 [00:45<00:37, 1935.04it/s][A
 55%|█████▌    | 87967/159571 [00:45<00:36, 1936.10it/s][A
 55%|█████▌    | 88216/159571 [00:45<00:36, 1937.31it/s][A
 55%|█████▌    | 88455/159571 [00:45<00:

 73%|███████▎  | 116524/159571 [00:58<00:21, 1981.29it/s][A
 73%|███████▎  | 116751/159571 [00:58<00:21, 1981.75it/s][A
 73%|███████▎  | 116976/159571 [00:59<00:21, 1981.93it/s][A
 73%|███████▎  | 117212/159571 [00:59<00:21, 1982.56it/s][A
 74%|███████▎  | 117458/159571 [00:59<00:21, 1983.35it/s][A
 74%|███████▍  | 117693/159571 [00:59<00:21, 1983.93it/s][A
 74%|███████▍  | 117925/159571 [00:59<00:20, 1984.13it/s][A
 74%|███████▍  | 118173/159571 [00:59<00:20, 1984.95it/s][A
 74%|███████▍  | 118405/159571 [00:59<00:20, 1985.27it/s][A
 74%|███████▍  | 118653/159571 [00:59<00:20, 1986.08it/s][A
 75%|███████▍  | 118886/159571 [00:59<00:20, 1986.58it/s][A
 75%|███████▍  | 119118/159571 [00:59<00:20, 1987.05it/s][A
 75%|███████▍  | 119348/159571 [01:00<00:20, 1987.53it/s][A
 75%|███████▍  | 119609/159571 [01:00<00:20, 1988.56it/s][A
 75%|███████▌  | 119848/159571 [01:00<00:19, 1989.13it/s][A
 75%|███████▌  | 120092/159571 [01:00<00:19, 1989.85it/s][A
 75%|███████▌  | 120349/

 93%|█████████▎| 148532/159571 [01:12<00:05, 2042.21it/s][A
 93%|█████████▎| 148753/159571 [01:12<00:05, 2042.40it/s][A
 93%|█████████▎| 148973/159571 [01:12<00:05, 2042.48it/s][A
 94%|█████████▎| 149199/159571 [01:13<00:05, 2042.77it/s][A
 94%|█████████▎| 149435/159571 [01:13<00:04, 2043.19it/s][A
 94%|█████████▍| 149665/159571 [01:13<00:04, 2043.54it/s][A
 94%|█████████▍| 149903/159571 [01:13<00:04, 2043.98it/s][A
 94%|█████████▍| 150133/159571 [01:13<00:04, 2044.32it/s][A
 94%|█████████▍| 150363/159571 [01:13<00:04, 2044.65it/s][A
 94%|█████████▍| 150592/159571 [01:13<00:04, 2044.48it/s][A
 95%|█████████▍| 150826/159571 [01:13<00:04, 2044.85it/s][A
 95%|█████████▍| 151071/159571 [01:13<00:04, 2045.39it/s][A
 95%|█████████▍| 151324/159571 [01:13<00:04, 2046.03it/s][A
 95%|█████████▍| 151579/159571 [01:14<00:03, 2046.69it/s][A
 95%|█████████▌| 151821/159571 [01:14<00:03, 2046.82it/s][A
 95%|█████████▌| 152054/159571 [01:14<00:03, 2047.14it/s][A
 95%|█████████▌| 152290/

 14%|█▍        | 21864/153164 [00:09<00:55, 2361.88it/s][A
 14%|█▍        | 22120/153164 [00:09<00:55, 2362.03it/s][A
 15%|█▍        | 22371/153164 [00:09<00:55, 2360.86it/s][A
 15%|█▍        | 22615/153164 [00:09<00:55, 2361.59it/s][A
 15%|█▍        | 22859/153164 [00:09<00:55, 2361.79it/s][A
 15%|█▌        | 23126/153164 [00:09<00:54, 2364.86it/s][A
 15%|█▌        | 23381/153164 [00:09<00:54, 2366.62it/s][A
 15%|█▌        | 23632/153164 [00:09<00:54, 2367.45it/s][A
 16%|█▌        | 23901/153164 [00:10<00:54, 2370.59it/s][A
 16%|█▌        | 24156/153164 [00:10<00:54, 2370.88it/s][A
 16%|█▌        | 24407/153164 [00:10<00:54, 2369.94it/s][A
 16%|█▌        | 24668/153164 [00:10<00:54, 2372.08it/s][A
 16%|█▋        | 24917/153164 [00:10<00:54, 2371.00it/s][A
 16%|█▋        | 25183/153164 [00:10<00:53, 2373.74it/s][A
 17%|█▋        | 25440/153164 [00:10<00:53, 2375.53it/s][A
 17%|█▋        | 25692/153164 [00:10<00:53, 2376.05it/s][A
 17%|█▋        | 25941/153164 [00:10<00:

 37%|███▋      | 56119/153164 [00:23<00:40, 2393.47it/s][A
 37%|███▋      | 56393/153164 [00:23<00:40, 2394.88it/s][A
 37%|███▋      | 56649/153164 [00:23<00:40, 2395.40it/s][A
 37%|███▋      | 56898/153164 [00:23<00:40, 2395.37it/s][A
 37%|███▋      | 57144/153164 [00:23<00:40, 2394.83it/s][A
 37%|███▋      | 57396/153164 [00:23<00:39, 2395.35it/s][A
 38%|███▊      | 57666/153164 [00:24<00:39, 2396.58it/s][A
 38%|███▊      | 57926/153164 [00:24<00:39, 2397.23it/s][A
 38%|███▊      | 58179/153164 [00:24<00:39, 2397.52it/s][A
 38%|███▊      | 58431/153164 [00:24<00:39, 2398.00it/s][A
 38%|███▊      | 58682/153164 [00:24<00:39, 2396.78it/s][A
 38%|███▊      | 58922/153164 [00:24<00:39, 2395.07it/s][A
 39%|███▊      | 59178/153164 [00:24<00:39, 2395.72it/s][A
 39%|███▉      | 59416/153164 [00:24<00:39, 2395.28it/s][A
 39%|███▉      | 59653/153164 [00:24<00:39, 2395.16it/s][A
 39%|███▉      | 59919/153164 [00:25<00:38, 2396.21it/s][A
 39%|███▉      | 60188/153164 [00:25<00:

 59%|█████▉    | 90364/153164 [00:37<00:26, 2391.50it/s][A
 59%|█████▉    | 90624/153164 [00:37<00:26, 2392.05it/s][A
 59%|█████▉    | 90885/153164 [00:37<00:26, 2392.62it/s][A
 60%|█████▉    | 91143/153164 [00:38<00:25, 2392.24it/s][A
 60%|█████▉    | 91395/153164 [00:38<00:25, 2392.57it/s][A
 60%|█████▉    | 91655/153164 [00:38<00:25, 2393.10it/s][A
 60%|██████    | 91908/153164 [00:38<00:25, 2393.21it/s][A
 60%|██████    | 92158/153164 [00:38<00:25, 2392.77it/s][A
 60%|██████    | 92401/153164 [00:38<00:25, 2392.33it/s][A
 60%|██████    | 92655/153164 [00:38<00:25, 2392.68it/s][A
 61%|██████    | 92897/153164 [00:38<00:25, 2392.50it/s][A
 61%|██████    | 93136/153164 [00:38<00:25, 2392.30it/s][A
 61%|██████    | 93373/153164 [00:39<00:24, 2392.23it/s][A
 61%|██████    | 93625/153164 [00:39<00:24, 2392.55it/s][A
 61%|██████▏   | 93876/153164 [00:39<00:24, 2392.77it/s][A
 61%|██████▏   | 94130/153164 [00:39<00:24, 2393.14it/s][A
 62%|██████▏   | 94377/153164 [00:39<00:

 81%|████████  | 124048/153164 [00:51<00:12, 2392.64it/s][A
 81%|████████  | 124306/153164 [00:51<00:12, 2392.48it/s][A
 81%|████████▏ | 124556/153164 [00:52<00:11, 2392.08it/s][A
 81%|████████▏ | 124798/153164 [00:52<00:11, 2384.52it/s][A
 82%|████████▏ | 125026/153164 [00:52<00:11, 2384.32it/s][A
 82%|████████▏ | 125254/153164 [00:52<00:11, 2384.11it/s][A
 82%|████████▏ | 125488/153164 [00:52<00:11, 2384.02it/s][A
 82%|████████▏ | 125707/153164 [00:52<00:11, 2383.47it/s][A
 82%|████████▏ | 125997/153164 [00:52<00:11, 2384.45it/s][A
 82%|████████▏ | 126239/153164 [00:52<00:11, 2384.51it/s][A
 83%|████████▎ | 126486/153164 [00:53<00:11, 2384.52it/s][A
 83%|████████▎ | 126747/153164 [00:53<00:11, 2384.90it/s][A
 83%|████████▎ | 126993/153164 [00:53<00:10, 2384.84it/s][A
 83%|████████▎ | 127236/153164 [00:53<00:10, 2384.58it/s][A
 83%|████████▎ | 127478/153164 [00:53<00:10, 2384.64it/s][A
 83%|████████▎ | 127773/153164 [00:53<00:10, 2385.67it/s][A
 84%|████████▎ | 128028/

In [16]:
xtrain_glove = np.array(xtrain_glove)
xtest_glove = np.array(xtest_glove)

In [None]:
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1, silent=False)
clf.fit(xtrain_glove, y)
predictions = clf.predict_proba(xvalid_test)