In [1]:
%load_ext klab-autotime

In [2]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    inspected; every other column (strings, objects, int8, ...) is left untouched.
    Integer columns are cast to the first of int8, int16, int32, int64 whose
    limits contain the column's min and max (limits inclusive). Float columns
    are cast to float16 or float32 on the same range test and to float64
    otherwise. The float choice looks only at the value range, so precision can
    be lost when a column is narrowed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its columns downcast.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first; the first one that fits wins.
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for target in int_targets:
                limits = np.iinfo(target)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Range exceeds float32 (or min/max is NaN/inf): keep full width.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes does not print nan.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

time: 3.8 ms


In [3]:
import os
import pandas as pd
import numpy as np
# Directory holding the competition CSV files; reused later for the test set.
Data_path = '/home/kesci/input/bytedance/'
train_path = os.path.join(Data_path, "train_final.csv")

# Read a headerless window of the training file (skip the first 800M lines,
# then take the next 200M rows) and downcast its numeric columns right away.
data_train = reduce_mem_usage(
    pd.read_csv(
        train_path,
        header=None,
        skiprows=800000000,
        nrows=200000000,
    )
)

Mem. usage decreased to 4196.17 Mb (45.0% reduction)
time: 16min 56s


In [4]:
# The CSV was read with header=None, so the frame has no column names yet;
# assign them here (five names, one per column).
data_train.columns=["query_id","query","query_title_id","title","label"]

time: 856 µs


In [5]:
import gc
# Cast the labels to 64-bit integers with one vectorized conversion instead of
# calling int() on every row through a Python lambda.
data_train['label'] = data_train['label'].astype(np.int64)
# Keep the labels as a standalone NumPy array; it is used as the training target.
label = data_train['label'].values

time: 1min 32s


In [6]:
# Only the 'title' column is used as the text corpus for vectorization.
corpus_train=data_train['title']

time: 554 µs


In [7]:
# Drop the name for the full frame; `label` and `corpus_train` were extracted
# from it in earlier cells and remain available.
del data_train
# Force a garbage-collection pass; its return value is what the cell displays.
gc.collect()

14

time: 65.6 ms


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
# Bigram-only TF-IDF features. fit_transform learns the vocabulary/IDF weights
# and builds the document-term matrix in one pass over the corpus, instead of
# analysing every title twice with fit() followed by transform().
tfidf_vec = TfidfVectorizer(ngram_range=(2, 2))
vec_train = tfidf_vec.fit_transform(corpus_train)

time: 3h 3min 43s


In [9]:
# The raw training titles are no longer needed once `vec_train` exists.
del corpus_train
# Force a garbage-collection pass; its return value is what the cell displays.
gc.collect()

0

time: 1min 48s


In [None]:
# Import from the public package path; the private `sklearn.linear_model.logistic`
# module is deprecated and not importable in newer scikit-learn releases.
from sklearn.linear_model import LogisticRegression
# Logistic regression on the TF-IDF matrix, seeded for reproducibility.
lr = LogisticRegression(random_state=2019)
lr.fit(vec_train, label)



In [None]:
# Drop the training feature matrix now that `lr` has been fitted on it.
del vec_train
# Force a garbage-collection pass.
gc.collect()

In [12]:
# Load the headerless test file, downcast its numeric columns, and name them.
test_path = os.path.join(Data_path, "bytedance_contest.final_2.csv")
test_df = pd.read_csv(test_path, header=None)
test_df = reduce_mem_usage(test_df)
test_df.columns = ["query_id", "query", "query_title_id", "title"]
# Submission skeleton: the two id columns. Take an explicit copy so that adding
# the prediction column later writes to an independent frame and does not
# raise pandas' SettingWithCopyWarning.
sub = test_df[['query_id', 'query_title_id']].copy()

Mem. usage decreased to 2002.72 Mb (34.4% reduction)
time: 3min 10s


In [13]:
# Keep only the test titles for vectorization; the id columns were already
# selected into `sub` in the previous cell.
corpus_test=test_df['title']
del test_df
# Force a garbage-collection pass; its return value is what the cell displays.
gc.collect()

14

time: 28.5 ms


In [14]:
# Vectorize the test titles with the vectorizer fitted on the training corpus
# (transform only, no refit).
vec_test=tfidf_vec.transform(corpus_test)
del corpus_test
# Force a garbage-collection pass; its return value is what the cell displays.
gc.collect()

0

time: 51min 26s


In [15]:
# Store column 1 of predict_proba as the score -- presumably the probability of
# label 1, assuming the labels are 0/1 (TODO confirm against lr.classes_).
sub['prediction']=lr.predict_proba(vec_test)[:,1]
# Write the submission without the index and without a header row.
sub.to_csv('tfidf.csv',index=False,header=False)

time: 7min 40s


In [16]:
# Preview the first 50 rows of the submission frame as a sanity check.
sub.head(50)

Unnamed: 0,query_id,query_title_id,prediction
0,1,1,0.195842
1,1,2,0.078095
2,1,3,0.206235
3,1,4,0.102851
4,1,5,0.210369
5,1,6,0.218956
6,1,7,0.107741
7,2,1,0.359315
8,2,2,0.357112
9,2,3,0.1434


time: 16.4 ms
