In [1]:
import os
from datetime import datetime

# Read data

In [2]:
encoding = 'latin' # utf8 cannot decode
train_path = './email_classification_data/train_data'
X_train, y_train = [], []
ham_path = train_path + '/ham'
spam_path = train_path + '/spam'
for file in os.listdir(ham_path):
    s = open(ham_path + '/' + file, encoding=encoding).read()
    X_train.append(s)
    y_train.append(0)
for file in os.listdir(spam_path):
    s = open(spam_path + '/' + file, encoding=encoding).read()
    X_train.append(s)
    y_train.append(1)

test_path = './email_classification_data/test_data'
X_test = []
idx_test = []
for file in os.listdir(test_path):
    s = open(test_path + '/' + file, encoding=encoding).read()
    X_test.append(s)
    idx_test.append(file[11:-4])

In [3]:
len(X_train)

4372

In [4]:
len(X_test)

800

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

In [6]:
X_trn, X_val, y_trn, y_val = train_test_split(X_train, y_train, test_size=0.2)

In [7]:
len(X_trn)

3497

In [8]:
len(X_val)

875

In [9]:
len(y_val)

875

In [10]:
count_vect = CountVectorizer()
X_trn_counts = count_vect.fit_transform(X_trn)
X_trn_counts.shape

(3497, 40780)

# Experiments

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import csv

In [12]:
tfidf_transformer = TfidfTransformer()
X_trn_tfidf = tfidf_transformer.fit_transform(X_trn_counts)
X_val_counts = count_vect.transform(X_val)
X_val_tfidf = tfidf_transformer.transform(X_val_counts)
clf = MultinomialNB().fit(X_trn_tfidf, y_trn)
pred_y_val = clf.predict(X_val_tfidf)
print(classification_report(y_true = y_val, y_pred = pred_y_val))

             precision    recall  f1-score   support

          0       0.85      1.00      0.92       641
          1       1.00      0.53      0.70       234

avg / total       0.89      0.88      0.86       875



### easy pipelines

In [13]:
def validate(txt_clf):
    scores = cross_val_score(estimator = txt_clf, X = X_train, y = y_train, cv=5)
    print(scores)
    print(np.mean(scores))

In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier
txt_clf =  Pipeline([('vect', CountVectorizer(max_df = 0.5, max_features = None, ngram_range = (1,2))),
                     ('tfidf', TfidfTransformer(use_idf = True, norm = 'l2')),
                     ('clf', SGDClassifier(penalty = 'l2',n_iter = 10, alpha = 1e-05 ))])
validate(txt_clf)
# produce(txt_clf)




[ 0.97371429  0.98171429  0.9805492   0.98283753  0.96567506]
0.976898071265


In [27]:
# Best score: 0.980
# Best parameters set:
# 	clf__alpha: 1e-05
# 	clf__n_iter: 10
# 	clf__penalty: 'l2'
# 	tfidf__norm: 'l2'
# 	tfidf__use_idf: True
# 	vect__max_df: 0.5
# 	vect__max_features: None
# 	vect__ngram_range: (1, 2)


parameters = {
    'vect__max_df': (0.5, 0.75),
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
### the following paramter of clf must define according to different model
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    'clf__n_iter': (10, 50, 80),
}
gridSearchParameter(txt_clf, parameters)
# randomSearchParameter(txt_clf, parameters, 10)

Performing grid search...
pipeline: ['vect', 'clf']
parameters:
{'clf__alpha': (1e-05, 1e-06),
 'clf__n_iter': (10, 50, 80),
 'clf__penalty': ('l2', 'elasticnet'),
 'vect__max_df': (0.5, 0.75),
 'vect__max_features': (None, 5000, 10000, 50000),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 192 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   48.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 13.2min
[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed: 16.8min finished


done in 1011.785s

Best score: 0.964
Best parameters set:
	clf__alpha: 1e-05
	clf__n_iter: 10
	clf__penalty: 'l2'
	vect__max_df: 0.75
	vect__max_features: 50000
	vect__ngram_range: (1, 2)


In [19]:
from sklearn.model_selection import cross_val_score
txt_clf =  Pipeline([('vect', CountVectorizer()),
                     ('clf', LogisticRegression())])
validate(txt_clf)

[ 0.96        0.968       0.97254005  0.96910755  0.95423341]
0.964776201373


In [20]:
txt_clf =  Pipeline([('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB(alpha=0.1))])
validate(txt_clf)

[ 0.96914286  0.98057143  0.97368421  0.96567506  0.96453089]
0.970720889179


In [21]:
txt_clf =  Pipeline([('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB(alpha=1.0))])
validate(txt_clf)

[ 0.97028571  0.98171429  0.96796339  0.96567506  0.96453089]
0.970033867277


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
txt_clf =  Pipeline([
#                      ('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
                    ('TFIDF_VEC', TfidfVectorizer()),
                     ('clf', MultinomialNB(alpha=0.01))])
validate(txt_clf)

[ 0.968       0.98057143  0.97597254  0.97025172  0.96224256]
0.971407649559


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
txt_clf =  Pipeline([
    ('TFIDF_VEC', TfidfVectorizer(norm='l2', sublinear_tf=True)),
    ('clf', MultinomialNB(alpha=0.1))
])
validate(txt_clf)

[ 0.96685714  0.97371429  0.96453089  0.96224256  0.96567506]
0.966603988231


In [24]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
txt_clf =  Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', RandomForestClassifier())])
validate(txt_clf)

[ 0.94285714  0.936       0.92791762  0.91647597  0.91075515]
0.926801176855


In [25]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
txt_clf =  Pipeline([('vect', CountVectorizer()),
                     ('clf', RandomForestClassifier())])
validate(txt_clf)

[ 0.93942857  0.92342857  0.95308924  0.93363844  0.90846682]
0.931610330173


In [None]:
txt_clf =  Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', AdaBoostClassifier())])
validate(txt_clf)

In [None]:
txt_clf =  Pipeline([('vect', CountVectorizer()),
                     ('clf', AdaBoostClassifier())])
validate(txt_clf)

In [None]:
txt_clf =  Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', RandomForestClassifier())])
validate(txt_clf)

###  Gridsearch and Randomsearch for best parameters

In [16]:
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from time import time

def gridSearchParameter(pipline, parameters):
    
    grid_search = GridSearchCV(pipline, parameters, n_jobs=-1, verbose=1, cv=5)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(X_train, y_train)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))


In [17]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint
from time import time

def randomSearchParameter(pipline, parameters, max_iter = 5):
    
    random_search = RandomizedSearchCV(pipline, parameters, n_jobs=-1, verbose=1, cv=5, n_iter = max_iter)

    print("Performing random search...")
    print("pipeline:", [name for name, _ in pipline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    random_search.fit(X_train, y_train)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % random_search.best_score_)
    print("Best parameters set:")
    best_parameters = random_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))


# model ensemble
失败了。。我没调好你可以删了

In [26]:
# # lb = 0.97500
txt_clf1 =  Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LogisticRegression())])

# # lb = 0.98750
txt_clf2 =  Pipeline([('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB(alpha=0.1))])

# # lb = 0.97500 ********************** best leaderboard
txt_clf3 =  Pipeline([
    ('TFIDF_VEC', TfidfVectorizer(norm='l2', sublinear_tf=True)),
    ('clf', MultinomialNB(alpha=0.1))
])

# # lb = 0.97916
txt_clf4 =  Pipeline([('vect', CountVectorizer()),
                     ('clf', LogisticRegression())])

In [47]:
a = [0, 1, 3, 4, 5, 5]

In [48]:
train = [1, 2, 4]

In [52]:
a[train]

TypeError: only integer scalar arrays can be converted to a scalar index

In [51]:
from sklearn.cross_validation import StratifiedKFold
n_folds = 5
skf = list(StratifiedKFold(y_train, n_folds))
## model put here:
clfs = [txt_clf1, txt_clf2, txt_clf3, txt_clf4]

dataset_blend_train = np.zeros((len(X_train), len(clfs)))
dataset_blend_test = np.zeros((len(X_test), len(clfs)))

for j, clf in enumerate(clfs):
    print (j, clf)
    dataset_blend_test_j = np.zeros((len(X_test), len(skf)))
    for i, (train, test) in enumerate(skf):
        X_train_fold = X_train[train]
        y_train_fold = y_train[train]
        X_test_fold = X_train[test]
        y_test_fold = y_train[test]
        clf.fit(X_train_fold, y_train_fold)
        y_submission = clf.predict_proba(X_test_fold)[:, 1]
        dataset_blend_train[test, j] = y_submission
        dataset_blend_test_j[:, i] = clf.predict_proba(X_test)[:, 1]
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

print
print ("Blending.")
clf = LogisticRegression()
clf.fit(dataset_blend_train, y_train)
# y_submission = clf.predict_proba(dataset_blend_test)[:, 1]
y_submission = clf.predict(dataset_blend_test)[:, 1]

0 Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])


TypeError: only integer scalar arrays can be converted to a scalar index

# Produce answer

In [None]:
txt_clf = None

In [23]:
def produce(txt_clf):
    txt_clf.fit(X_train, y_train)
    pred_y_test = txt_clf.predict(X_test)
    answer = np.stack([idx_test, pred_y_test], axis=-1)
    print(answer)
    with open('output/' + datetime.now().strftime("%Y-%m-%d_%H_%M_%S") + '.csv', 'w', newline='') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerow(['email_id', 'labels'])
        writer.writerows(answer)

In [None]:
# # lb = 0.97500
# txt_clf =  Pipeline([('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
#                      ('clf', LogisticRegression())])

# # lb = 0.98750
# txt_clf =  Pipeline([('vect', CountVectorizer()),
# #                      ('tfidf', TfidfTransformer()),
#                      ('clf', MultinomialNB(alpha=0.1))])

# # lb = 0.97500 ********************** best leaderboard
# txt_clf =  Pipeline([
#     ('TFIDF_VEC', TfidfVectorizer(norm='l2', sublinear_tf=True)),
#     ('clf', MultinomialNB(alpha=0.1))
# ])

# # lb = 0.97916
# txt_clf =  Pipeline([('vect', CountVectorizer()),
#                      ('clf', LogisticRegression())])



produce(txt_clf)