In [2]:
import numpy as np
import pandas as pd
import sklearn as sk

import matplotlib as mpl
import matplotlib.pylab as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, roc_auc_score, accuracy_score

import joblib
from joblib import dump, load

pd.options.display.max_columns = 400
pd.options.display.max_rows = 200
pd.options.display.max_colwidth = 600
pd.options.display.precision = 10

In [3]:
df_train = pd.read_excel("C:/Users/tjdal/Desktop/fast_campus/Project/individual_project/Crowdflower_Search/__data/excel/train.xlsx").fillna("")

In [4]:
df_test = pd.read_excel("C:/Users/tjdal/Desktop/fast_campus/Project/individual_project/Crowdflower_Search/__data/excel/test.xlsx").fillna("")

In [5]:
test_sub = pd.read_excel("C:/Users/tjdal/Desktop/fast_campus/Project/individual_project/Crowdflower_Search/__data/excel/sampleSubmission.xlsx")

In [6]:
df_train = df_train.drop('Unnamed: 6', axis = 1)
df_train = df_train.drop('Unnamed: 7', axis = 1)
df_train = df_train.drop('relevance_variance', axis = 1)

In [7]:
df_test = df_test.drop('Unnamed: 4', axis = 1)
df_test = df_test.drop('Unnamed: 5', axis = 1)

### df_test / test_sub matching

In [8]:
print(len(df_test))
print(len(test_sub))

22511
22513


In [9]:
# submission 에는 있지만 test data 에는 존재하지 않는 'id' 확인

no_sub = []
for num in range(len(test_sub['id'])):
    if test_sub['id'][num] not in df_test['id'].values:
        no_sub.append(test_sub['id'][num])

print(no_sub)

[23102, 24760]


In [10]:
# 위의 'id' 값의 index 확인

for num in range(len(test_sub.values)):
    if test_sub['id'].values[num] in no_sub:
        print('index : ', test_sub.index[num], 'value : ', test_sub['id'].values[num])

index :  15947 value :  23102
index :  17085 value :  24760


In [11]:
# 존재하지 않는 'id' index 제거

test_sub = test_sub.drop(17085, axis=0)
test_sub = test_sub.drop(15947, axis=0)

In [12]:
print(test_sub['id'].values[15945])
print(test_sub['id'].values[15946])
print(test_sub['id'].values[15947])
print(test_sub['id'].values[15948])

23098
23101
23103
23104


In [13]:
print(test_sub['id'].values[17083])
print(test_sub['id'].values[17084])
print(test_sub['id'].values[17085])
print(test_sub['id'].values[17086])

24759
24761
24762
24764


In [14]:
y = df_train['median_relevance']

In [15]:
y = joblib.load('y.pkl')

### Corpus 제작 및 Question / Result 구분

In [16]:
train_corpus = df_train.drop('id', axis = 1)
train_corpus = train_corpus.drop('product_description', axis = 1)
train_corpus = train_corpus.drop('median_relevance', axis = 1)

In [17]:
test_corpus = df_test.drop('id', axis = 1)
test_corpus = test_corpus.drop('product_description', axis = 1)

In [18]:
df_corpus = pd.concat([train_corpus, test_corpus], axis=0)

In [19]:
df_corpus

Unnamed: 0,query,product_title
0,bridal shower decorations,Accent Pillow with Heart Design - Red/Black
1,led christmas lights,Set of 10 Battery Operated Multi LED Train Christmas Lights - Clear Wire
2,projector,ViewSonic Pro8200 DLP Multimedia Projector
3,wine rack,"Concept Housewares WR-44526 Solid-Wood Ceiling/Wall-Mount Wine Rack, Charcoal Grey, 6 Bottle"
4,light bulb,Wintergreen Lighting Christmas LED Light Bulb (Pack of 25)
5,oakley polarized radar,Oakley Sunglasses - Radar Path Polished Black/Gray Sunglasses
6,boyfriend jeans,How To Make An American Quilt (DVD)
7,screen protector samsung,ZAGG InvisibleShield Cell Phone Screen Protector for Samsung Galaxy S4 Mini
8,pots and pans set,"Cook N Home Stainless Steel 4-Piece Pasta Cooker/ Steamer Multi-pots with Encapsulated Bottom, 8-Quart"
9,waffle maker,Presto FlipSide Electric Waffle Maker- 03510


In [20]:
def feature_merge(df_text):
    textdata = list(df_text.apply(lambda x:'%s %s' % (x['query'], x['product_title']), axis=1))
    return textdata

In [21]:
corpusdata = feature_merge(df_corpus)

In [22]:
train_q = list(df_train['query'])
test_q = list(df_test['query'])

In [23]:
train_r = list(df_train['product_title'])
test_r = list(df_test['product_title'])

In [24]:
def no_str_check(data):
    for num in range(len(data)):
        if type(data[num]) == float or type(data[num]) == int:
            data[num] = 'a'

In [25]:
no_str_check(test_r)

In [26]:
types = []
for num in range(len(test_r)):
    types.append(type(test_r[num]))

type_set = set(types)
print(type_set)    

{<class 'str'>}


### 소문자 변환

In [27]:
def lower_convert(data):
    for num in range(len(data)):
        data[num] = data[num].lower()

In [28]:
lower_convert(train_q)
lower_convert(train_r)
lower_convert(test_q)
lower_convert(test_r)
lower_convert(corpusdata)

### 알파벳 / 숫자 stopwords

In [29]:
import re

In [30]:
def alphabet_stopwords(data):
    for num in range(len(data)):
        data[num] = re.findall(r'[a-zA-Z1-9]+', data[num])
        
    for num in range(len(data)):
        data[num] = (" ").join(data[num])
    
    for num in range(len(data)):
        data[num] = re.findall(r'\w\w+', data[num])

In [31]:
%%time
alphabet_stopwords(train_q)
alphabet_stopwords(train_r)
alphabet_stopwords(test_q)
alphabet_stopwords(test_r)
alphabet_stopwords(corpusdata)

Wall time: 1.09 s


In [32]:
def data_join(list_data):
    for num in range(len(list_data)):
        list_data[num] = (" ").join(list_data[num])

In [33]:
data_join(train_q)
data_join(train_r)
data_join(test_q)
data_join(test_r)
data_join(corpusdata)

### 토큰 생성

In [34]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [35]:
def word_token(data):
    for num in range(len(data)):
        data[num] = word_tokenize(data[num])

In [36]:
%%time
word_token(train_q)
word_token(train_r)
word_token(test_q)
word_token(test_r)
word_token(corpusdata)

Wall time: 15 s


### Stemming

In [37]:
from nltk.stem import PorterStemmer

In [38]:
# 타 참가자의 코드 참고

def stemPorter(text):
            porter = PorterStemmer()
            stem_data = []
            for num in text:
                final_stem = []
                for word in num:
                    final_stem.append(porter.stem(word))
                stem_data.append(final_stem)
            return stem_data

In [39]:
%%time
train_q = stemPorter(train_q)
train_r = stemPorter(train_r)
test_q = stemPorter(test_q)
test_t = stemPorter(test_r)
corpusdata = stemPorter(corpusdata)

Wall time: 23.9 s


### TF-IDF

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [41]:
%%time
data_join(train_q)
data_join(train_r)
data_join(test_q)
data_join(test_r)
data_join(corpusdata)

Wall time: 97.3 ms


In [42]:
tfv = TfidfVectorizer(analyzer='word', token_pattern=r'\w+', ngram_range=(1, 3), stop_words = 'english')

In [43]:
tfv.fit(corpusdata)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='\\w+', tokenizer=None, use_idf=True,
        vocabulary=None)

In [44]:
train_Q = tfv.transform(train_q)
train_R = tfv.transform(train_r)
test_Q = tfv.transform(test_q)
test_R = tfv.transform(test_r)

In [45]:
from scipy.sparse import hstack

In [46]:
train_X = hstack((train_Q, train_R))

In [47]:
test_X = hstack((test_Q, test_R))

In [48]:
print(train_Q.shape)
print(train_R.shape)
print(test_Q.shape)
print(test_R.shape)
print(train_X.shape)
print(test_X.shape)

(10157, 254750)
(10157, 254750)
(22511, 254750)
(22511, 254750)
(10157, 509500)
(22511, 509500)


### Modeling

In [49]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn import pipeline
from sklearn import grid_search
from sklearn import metrics



In [50]:
svd = TruncatedSVD()
scl = StandardScaler()
xgb_model = xgb.XGBRegressor()    

In [51]:
clf = pipeline.Pipeline([('svd', svd), ('scl', scl), ('xgb', xgb_model)])

In [52]:
clf2 = pipeline.Pipeline([('svd', svd), ('xgb', xgb_model)])

In [53]:
clf3 = pipeline.Pipeline([('xgb', xgb_model)])

In [54]:
param_grid = {'xgb__max_depth':[5, 6, 7, 8, 9, 10], 'xgb__n_estimators':list(range(10, 101, 10)), 
              'xgb__learning_rate':[0.001, 0.01, 0.1], 'xgb__gamma':[0.1, 1.0, 10.0, 100.0], 'xgb__subsample':[0.75]}

In [55]:
from ml_metrics import quadratic_weighted_kappa

In [56]:
from sklearn.metrics import r2_score

In [57]:
kappa_scorer = metrics.make_scorer(quadratic_weighted_kappa, greater_is_better = True)

In [58]:
model = grid_search.GridSearchCV(estimator = clf, param_grid=param_grid, scoring='r2',
                                     verbose=1, n_jobs=-1, iid=True, refit=True, cv=4)

In [59]:
model2 = grid_search.GridSearchCV(estimator = clf2, param_grid=param_grid, scoring='r2',
                                     verbose=1, n_jobs=-1, iid=True, refit=True, cv=4)

In [61]:
model3 = grid_search.GridSearchCV(estimator = clf3, param_grid=param_grid, scoring='r2',
                                     verbose=1, n_jobs=-1, iid=True, refit=True, cv=4)

In [343]:
%%time
reg_model = model.fit(train_X, y)

Fitting 4 folds for each of 720 candidates, totalling 2880 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   45.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 19.0min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 27.4min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 37.2min
[Parallel(n_jobs=-1)]: Done 2880 out of 2880 | elapsed: 43.9min finished


Wall time: 43min 58s


In [365]:
%%time
reg_model2 = model2.fit(train_X, y)

Fitting 4 folds for each of 720 candidates, totalling 2880 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   42.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 12.0min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 18.7min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 26.9min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 36.6min
[Parallel(n_jobs=-1)]: Done 2880 out of 2880 | elapsed: 43.1min finished


Wall time: 43min 10s


In [366]:
reg_model2.best_score_

0.11655279577750394

In [62]:
%%time
reg_model3 = model2.fit(train_X, y)

Fitting 4 folds for each of 720 candidates, totalling 2880 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   43.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 18.6min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 26.8min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 36.4min
[Parallel(n_jobs=-1)]: Done 2880 out of 2880 | elapsed: 42.9min finished


Wall time: 42min 58s


In [63]:
reg_model3.best_score_

0.11495273516491124