In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['embeddings', 'sample_submission.csv', 'test.csv', 'train.csv']


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, preprocessing, metrics, ensemble, naive_bayes, linear_model
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score
import lightgbm as lgb

In [4]:
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
print("Train shape : ", train_df.shape)
print("Test shape : ", test_df.shape)

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


In [5]:
train_df.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [6]:
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

## Number of words in the text ##
train_df["num_words"] = train_df["question_text"].apply(lambda x: len(str(x).split()))
test_df["num_words"] = test_df["question_text"].apply(lambda x: len(str(x).split()))

## Number of unique words in the text ##
train_df["num_unique_words"] = train_df["question_text"].apply(lambda x: len(set(str(x).split())))
test_df["num_unique_words"] = test_df["question_text"].apply(lambda x: len(set(str(x).split())))

## Number of characters in the text ##
train_df["num_chars"] = train_df["question_text"].apply(lambda x: len(str(x)))
test_df["num_chars"] = test_df["question_text"].apply(lambda x: len(str(x)))

## Number of stopwords in the text ##
train_df["num_stopwords"] = train_df["question_text"].apply(lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS]))
test_df["num_stopwords"] = test_df["question_text"].apply(lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS]))

## Number of punctuations in the text ##
train_df["num_punctuations"] =train_df['question_text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )
test_df["num_punctuations"] =test_df['question_text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )

## Number of title case words in the text ##
train_df["num_words_upper"] = train_df["question_text"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
test_df["num_words_upper"] = test_df["question_text"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))

## Number of title case words in the text ##
train_df["num_words_title"] = train_df["question_text"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
test_df["num_words_title"] = test_df["question_text"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))

## Average length of the words in the text ##
train_df["mean_word_len"] = train_df["question_text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
test_df["mean_word_len"] = test_df["question_text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

In [7]:
## Truncate some extreme values for better visuals ##
train_df['num_words'].loc[train_df['num_words']>60] = 60 #truncation for better visuals
train_df['num_punctuations'].loc[train_df['num_punctuations']>10] = 10 #truncation for better visuals
train_df['num_chars'].loc[train_df['num_chars']>350] = 350 #truncation for better visuals

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [8]:
train_text = train_df['question_text']
test_text = test_df['question_text']
all_text = pd.concat([train_text, test_text])

word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=5000)
word_vectorizer.fit(all_text)
train_tfidf = word_vectorizer.transform(train_text)
test_tfidf = word_vectorizer.transform(test_text)

In [9]:
eng_features = ['num_words', 'num_unique_words', 'num_chars', 
                'num_stopwords', 'num_punctuations', 'num_words_upper', 
                'num_words_title', 'mean_word_len']
train_ = train_df[eng_features]

In [10]:
from scipy.sparse import hstack, csr_matrix
train_ = hstack((csr_matrix(train_), train_tfidf))
print(train_.shape)

(1306122, 5008)


In [11]:
test_ = test_df[eng_features]
test_ = hstack((csr_matrix(test_), test_tfidf))
print(test_.shape)

(375806, 5008)


In [12]:
train_y = train_df["target"].values

x_train, x_val, y_train, y_val = model_selection.train_test_split(train_, train_y, test_size=0.2, random_state=42)

print(x_train.shape, x_val.shape)

(1044897, 5008) (261225, 5008)


In [13]:
%%time
from sklearn.metrics import f1_score

def lgb_f1_score(y_hat, data):
    y_true = data.get_label()
    y_hat = np.round(y_hat) # scikits f1 doesn't like probabilities
    return 'f1', f1_score(y_true, y_hat), True

params = {'application': 'binary',
          'metric': 'binary_logloss',
          'learning_rate': 0.05,   
          'max_depth': 9,
          'num_leaves': 100,
          'verbosity': -1,
          'data_random_seed': 3,
          'bagging_fraction': 0.8,
          'feature_fraction': 0.4,
          'nthread': 16,
          'lambda_l1': 1,
          'lambda_l2': 1,
          'num_rounds': 2700,
          'verbose_eval': 100}

d_train = lgb.Dataset(x_train, label=y_train)
d_valid = lgb.Dataset(x_val, label=y_val)
print('Train LGB')
num_rounds = params.pop('num_rounds')
verbose_eval = params.pop('verbose_eval')
model = lgb.train(params,
                  train_set=d_train,
                  num_boost_round=num_rounds,
                  valid_sets=[d_train, d_valid],
                  verbose_eval=verbose_eval,
                  valid_names=['train', 'val'],
                  feval=lgb_f1_score)
print('Predict')
pred_test_va = model.predict(x_val)

Train LGB


  'precision', 'predicted', average, warn_for)


[100]	train's binary_logloss: 0.155827	train's f1: 0.301909	val's binary_logloss: 0.156128	val's f1: 0.282632
[200]	train's binary_logloss: 0.146923	train's f1: 0.37326	val's binary_logloss: 0.148458	val's f1: 0.35076
[300]	train's binary_logloss: 0.141664	train's f1: 0.406796	val's binary_logloss: 0.144095	val's f1: 0.379636
[400]	train's binary_logloss: 0.137879	train's f1: 0.430888	val's binary_logloss: 0.14115	val's f1: 0.399964
[500]	train's binary_logloss: 0.135007	train's f1: 0.448233	val's binary_logloss: 0.139036	val's f1: 0.41228
[600]	train's binary_logloss: 0.132656	train's f1: 0.461935	val's binary_logloss: 0.137356	val's f1: 0.42436
[700]	train's binary_logloss: 0.130644	train's f1: 0.473417	val's binary_logloss: 0.136005	val's f1: 0.432314
[800]	train's binary_logloss: 0.128887	train's f1: 0.483113	val's binary_logloss: 0.134843	val's f1: 0.439383
[900]	train's binary_logloss: 0.127356	train's f1: 0.491789	val's binary_logloss: 0.133882	val's f1: 0.444655
[1000]	train's 

In [14]:
%%time
best_threshold = 0.01
best_score = 0.0
for threshold in range(1, 100):
    threshold = threshold / 100
    score = f1_score(y_val, pred_test_va > threshold)
    if score > best_score:
        best_threshold = threshold
        best_score = score
print(0.5, f1_score(y_val, pred_test_va > 0.5))
print(best_threshold, best_score)

0.5 0.48999186330349886
0.25 0.5897412782836021
CPU times: user 4.2 s, sys: 0 ns, total: 4.2 s
Wall time: 4.2 s


In [None]:
%%time
pred_test_y = model.predict(test_)



In [None]:
submit_df = pd.DataFrame({"qid": test_df["qid"], "prediction": (pred_test_y > best_threshold).astype(np.int)})
submit_df.head()

In [None]:
submit_df['prediction'].value_counts()

In [None]:
submit_df.to_csv("submission.csv", index=False)