# Imports

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sps
import time
import math
from collections import Counter

# Carregando os dados

In [2]:
# Column dtypes handed to pandas when parsing the CSV files.
# Text, categorical, identifier and date columns are read as plain objects ...
dtype = dict.fromkeys(
    [
        "region", "city", "parent_category_name", "category_name",
        "title", "description", "activation_date", "user_type", "image",
        "item_id", "param_1", "param_2", "param_3", "user_id",
    ],
    object,
)
# ... while numeric columns get explicit NumPy dtypes.
dtype.update({
    "price": np.float32,
    "image_top_1": np.float32,
    "deal_probability": np.float32,
    "item_seq_number": np.uint,
})

# Load the train and test samples (train first, then test) and time the load.
start = time.time()
df_train, df_test = (
    pd.read_csv(csv_path, dtype=dtype, encoding='utf8')
    for csv_path in ('./data/train_small.csv', './data/test_small.csv')
)
end = time.time()

print('Tempo de carregamento dos CSVs: ' + format(end - start, '.2f') + 's\n')


# Column groups; each group is processed by a dedicated cell further down.
text_columns = ["title", "description"]
categorical_columns = [
    "user_type", "parent_category_name", "category_name",
    "user_id", "param_1", "param_2", "param_3",
]
numeric_columns = ["price", "image_top_1", "item_seq_number"]

# Feature matrices produced by the processing cells are accumulated in these
# lists and stacked side by side at the end of the notebook.
matrizes_train_resultado, matrizes_test_resultado = [], []


Tempo de carregamento dos CSVs: 0.01s



# Tratando dados faltantes

In [3]:
start = time.time()

# NOTE: every fill below assigns the result back to the column instead of
# calling fillna(inplace=True) on the column selection. In-place updates made
# through a chained selection are deprecated in recent pandas releases and are
# not guaranteed to reach the parent frame under Copy-on-Write.

# Missing text is replaced with the empty string.
for column in text_columns:
    df_train[column] = df_train[column].fillna('')
    df_test[column] = df_test[column].fillna('')

# Missing numeric values are replaced with the column mean, computed
# separately for the train and the test frame.
for column in numeric_columns:
    df_train[column] = df_train[column].fillna(df_train[column].mean())
    df_test[column] = df_test[column].fillna(df_test[column].mean())

# Missing categorical values are replaced with the most frequent category of
# the TRAINING data; the same value is used for train and test.
for column in categorical_columns:
    # Missing markers (NaN and the literal string 'nan') are excluded before
    # counting so they can never be chosen as the fill value.
    observed = df_train[column].dropna()
    observed = observed[observed != 'nan']
    if observed.empty:
        # Nothing but missing values in the training column: there is no
        # category to impute with, so the column is left untouched.
        continue

    # Ties are resolved in favour of the category that appears first.
    categoria_frequente = Counter(observed).most_common(1)[0][0]
    observed = None

    df_train[column] = df_train[column].fillna(categoria_frequente)
    df_test[column] = df_test[column].fillna(categoria_frequente)

end = time.time()

print('Tempo de tratamento de dados faltantes: '+str("%.2f" % (end - start))+'s\n')

Tempo de tratamento de dados faltantes: 0.01s



# Exibindo os dados

In [4]:
# Preview the first five rows of the training frame.
df_train.head(n=5)

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,price,item_seq_number,activation_date,user_type,image,image_top_1,deal_probability
0,b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,Обувь,> 50 (XXL),Кокоби(кокон для сна),"Кокон для сна малыша,пользовались меньше месяц...",400.0,2,2017-03-28,Private,d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0,0.12789
1,2dac0150717d,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,Обувь,> 50 (XXL),Стойка для Одежды,"Стойка для одежды, под вешалки. С бутика.",3000.0,19,2017-03-26,Private,79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0,0.0
2,ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",Обувь,> 50 (XXL),Philips bluray,"В хорошем состоянии, домашний кинотеатр с blu ...",4000.0,9,2017-03-20,Private,b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0,0.43177
3,02996f1dd2ea,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,Обувь,> 50 (XXL),Автокресло,Продам кресло от0-25кг,2200.0,286,2017-03-25,Company,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0,0.80323
4,7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110,"ВАЗ 2110, 2003",Все вопросы по телефону.,40000.0,3,2017-03-16,Private,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0,0.20797


# Eliminando colunas desnecessárias

In [5]:
# 'item_id' is removed from both frames before the features are built.
columns_to_drop = ['item_id']
for frame in (df_train, df_test):
    frame.drop(columns=columns_to_drop, inplace=True)
df_train.head(n=5)

Unnamed: 0,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,price,item_seq_number,activation_date,user_type,image,image_top_1,deal_probability
0,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,Обувь,> 50 (XXL),Кокоби(кокон для сна),"Кокон для сна малыша,пользовались меньше месяц...",400.0,2,2017-03-28,Private,d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0,0.12789
1,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,Обувь,> 50 (XXL),Стойка для Одежды,"Стойка для одежды, под вешалки. С бутика.",3000.0,19,2017-03-26,Private,79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0,0.0
2,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",Обувь,> 50 (XXL),Philips bluray,"В хорошем состоянии, домашний кинотеатр с blu ...",4000.0,9,2017-03-20,Private,b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0,0.43177
3,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,Обувь,> 50 (XXL),Автокресло,Продам кресло от0-25кг,2200.0,286,2017-03-25,Company,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0,0.80323
4,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110,"ВАЗ 2110, 2003",Все вопросы по телефону.,40000.0,3,2017-03-16,Private,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0,0.20797


# Carregamento das features geradas pelas imagens

In [6]:
# Image features were computed in an external notebook and stored as CSV.

start = time.time()

# Training data: default (inner) join, so training rows whose image has no
# entry in the feature table are not kept.
df_images_train = pd.read_csv('./data/train_jpg.csv', encoding='utf8')
df_train  = df_train.merge(df_images_train, left_on = "image", right_on = "id", copy=False)
df_images_train = None
df_train.drop(columns=["image","id"],inplace=True)

# Test data: rows without an image get the placeholder key "no-image" and the
# join is a LEFT join, so every test row is kept. With the default inner join
# those rows were dropped, which left the mean imputation below with nothing
# to fill.
df_test["image"] = df_test["image"].fillna("no-image")
df_images_test = pd.read_csv('./data/test_jpg.csv', encoding='utf8')
df_test  = df_test.merge(df_images_test, how = "left", left_on = "image", right_on = "id", copy=False)
df_test.drop(columns=["image","id"],inplace=True)

# Every remaining column of the image table is a numeric feature.
df_images_test.drop(columns=["id"], inplace = True)
numeric_columns_images = df_images_test.columns.tolist()
numeric_columns += numeric_columns_images
df_images_test = None

# Test rows without image features receive the column mean. The result is
# assigned back because chained fillna(inplace=True) is deprecated in pandas.
for column in numeric_columns_images:
    df_test[column] = df_test[column].fillna(df_test[column].mean())

numeric_columns_images = None

end = time.time()

print('Tempo de merge com dataset de dados das imagens : '+str("%.2f" % (end - start))+'s\n')

df_train.head(5)

Tempo de merge com dataset de dados das imagens : 8.32s



Unnamed: 0,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,...,width,height,size,dullness,whiteness,average_red,average_green,average_blue,average_pixel_width,blurrness_score
0,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,Обувь,> 50 (XXL),Кокоби(кокон для сна),"Кокон для сна малыша,пользовались меньше месяц...",...,358.0,480.0,27039.0,0.0,1.965,0.360951,0.320277,0.612833,2.329493,398.428606
1,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,Обувь,> 50 (XXL),Стойка для Одежды,"Стойка для одежды, под вешалки. С бутика.",...,360.0,480.0,30385.0,6.335,0.0,0.297394,0.366578,0.422213,3.273727,1014.477548
2,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",Обувь,> 50 (XXL),Philips bluray,"В хорошем состоянии, домашний кинотеатр с blu ...",...,392.0,360.0,18681.0,0.0,72.02,0.703338,0.703571,0.703576,2.649518,493.921065
3,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,Обувь,> 50 (XXL),Автокресло,Продам кресло от0-25кг,...,360.0,360.0,13656.0,0.0,94.325,0.851712,0.846464,0.846093,1.54784,377.127677
4,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110,"ВАЗ 2110, 2003",Все вопросы по телефону.,...,640.0,360.0,36710.0,7.715,1.32,0.449327,0.512993,0.485592,2.469618,557.361892


# Processando colunas geográficas   

In [7]:
# City coordinates provided by Kaggle user FrankHerfert
# Ref.: https://www.kaggle.com/frankherfert/region-and-city-details-with-lat-lon-and-clusters/data

start = time.time()

df_coords = pd.read_csv('./data/avito_region_city_features.csv', encoding='utf8')
df_coords.drop(columns=["region_id","city_region_id","region","city_region"], inplace=True)

# Everything except the join key is a numeric geographic feature.
geo_feature_columns = [name for name in df_coords.columns if name != "city"]

# NOTE(review): the join key is the city name only. This assumes city names
# are unique in the coordinates file; if the same name occurs in several
# regions, matching rows are duplicated by the merge - TODO confirm.
#
# Training data keeps the default (inner) join. Test data uses a LEFT join so
# that a test row whose city is absent from the coordinates file is kept
# instead of being silently dropped.
df_train  = df_train.merge(df_coords, on = "city", copy=False)
df_test   = df_test.merge (df_coords, how = "left", on = "city", copy=False)
df_train.drop(columns=["city","region"], inplace=True)
df_test.drop(columns=["city","region"], inplace=True)

# Test rows without a coordinate match receive the column mean, the same
# strategy used for the other numeric features of this notebook.
for column in geo_feature_columns:
    df_test[column] = df_test[column].fillna(df_test[column].mean())

numeric_columns += geo_feature_columns

# Release memory
df_coords = None

end = time.time()

print('Tempo de merge com dataset de dados geográficos : '+str("%.2f" % (end - start))+'s\n')

df_train.head(5)

Tempo de merge com dataset de dados geográficos : 0.03s



Unnamed: 0,user_id,parent_category_name,category_name,param_1,param_2,param_3,title,description,price,item_seq_number,...,average_red,average_green,average_blue,average_pixel_width,blurrness_score,latitude,longitude,lat_lon_hdbscan_cluster_05_03,lat_lon_hdbscan_cluster_10_03,lat_lon_hdbscan_cluster_20_03
0,e00f8ff2eaf9,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,Обувь,> 50 (XXL),Кокоби(кокон для сна),"Кокон для сна малыша,пользовались меньше месяц...",400.0,2,...,0.360951,0.320277,0.612833,2.329493,398.428606,56.838926,60.605702,65,38,21
1,39aeb48f0017,Для дома и дачи,Мебель и интерьер,Другое,Обувь,> 50 (XXL),Стойка для Одежды,"Стойка для одежды, под вешалки. С бутика.",3000.0,19,...,0.297394,0.366578,0.422213,3.273727,1014.477548,53.241504,50.221246,58,36,19
2,91e2f88dd6e3,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",Обувь,> 50 (XXL),Philips bluray,"В хорошем состоянии, домашний кинотеатр с blu ...",4000.0,9,...,0.703338,0.703571,0.703576,2.649518,493.921065,47.235714,39.701505,46,40,9
3,3ce3c2318b16,Бытовая электроника,Товары для компьютера,Мониторы,Обувь,> 50 (XXL),Монитор acer 18.5,Продам манитор 18.5 v193HQV. В хорошем состоянии,2500.0,47,...,0.364382,0.389037,0.41409,1.414931,286.578111,47.235714,39.701505,46,40,9
4,6630d724bbf5,Личные вещи,Детская одежда и обувь,Для девочек,Обувь,25,Туфли moschino,"Очень красивые стильные туфельки, состояние от...",1300.0,59,...,0.501076,0.527293,0.538818,2.56713,1436.013879,47.235714,39.701505,46,40,9


# Processamento de colunas temporais

In [8]:
# Convert the activation date into numeric features

start = time.time()

date_column_name = 'activation_date'
numeric_columns_dates = ["year","month","day"]

# The date becomes three columns: year - 1970, month and day.
# The same steps run on both frames; the vectorized .dt accessors replace the
# per-row lambdas and the intermediate list-valued column.
for frame in (df_train, df_test):
    dates = pd.to_datetime(frame[date_column_name])
    frame['year']  = dates.dt.year - 1970
    frame['month'] = dates.dt.month
    frame['day']   = dates.dt.day
    frame.drop(columns=[date_column_name], inplace=True)

dates = None

numeric_columns += numeric_columns_dates

end = time.time()

print('Tempo de processamento da coluna de datas : '+str("%.2f" % (end - start))+'s\n')

df_train.head(5)

Tempo de processamento da coluna de datas : 0.03s



Unnamed: 0,user_id,parent_category_name,category_name,param_1,param_2,param_3,title,description,price,item_seq_number,...,average_pixel_width,blurrness_score,latitude,longitude,lat_lon_hdbscan_cluster_05_03,lat_lon_hdbscan_cluster_10_03,lat_lon_hdbscan_cluster_20_03,year,month,day
0,e00f8ff2eaf9,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,Обувь,> 50 (XXL),Кокоби(кокон для сна),"Кокон для сна малыша,пользовались меньше месяц...",400.0,2,...,2.329493,398.428606,56.838926,60.605702,65,38,21,47,3,28
1,39aeb48f0017,Для дома и дачи,Мебель и интерьер,Другое,Обувь,> 50 (XXL),Стойка для Одежды,"Стойка для одежды, под вешалки. С бутика.",3000.0,19,...,3.273727,1014.477548,53.241504,50.221246,58,36,19,47,3,26
2,91e2f88dd6e3,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",Обувь,> 50 (XXL),Philips bluray,"В хорошем состоянии, домашний кинотеатр с blu ...",4000.0,9,...,2.649518,493.921065,47.235714,39.701505,46,40,9,47,3,20
3,3ce3c2318b16,Бытовая электроника,Товары для компьютера,Мониторы,Обувь,> 50 (XXL),Монитор acer 18.5,Продам манитор 18.5 v193HQV. В хорошем состоянии,2500.0,47,...,1.414931,286.578111,47.235714,39.701505,46,40,9,47,3,22
4,6630d724bbf5,Личные вещи,Детская одежда и обувь,Для девочек,Обувь,25,Туфли moschino,"Очень красивые стильные туфельки, состояние от...",1300.0,59,...,2.56713,1436.013879,47.235714,39.701505,46,40,9,47,3,20


# Processamento de texto corrido

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import RussianStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from IPython.core.display import HTML

def exibir_dataframe(dataframe):
    """Render the first five rows of ``dataframe`` as an HTML table in the cell output."""
    return display(HTML(dataframe.head(5).to_html()))

# Russian stop words, one entry per line of the file.
# The file is opened in a ``with`` block so the handle is closed right after
# reading (the bare open(...).read() left it open).
with open('data/stopwords.txt', encoding='utf8') as stopwords_file:
    stop_words_ru = stopwords_file.read().split('\n')

# Built once and shared by every call: the vectorizer invokes the analyzer
# for each document, so constructing these objects inside the function
# repeated the same setup work per document.
_RUSSIAN_STEMMER = RussianStemmer()
_WORD_ANALYZER = CountVectorizer().build_analyzer()


def stemmed_words(doc):
    """Yield the stemmed tokens of ``doc``.

    The text is split into tokens by the default ``CountVectorizer`` analyzer
    and each token is reduced to its stem with the Russian Snowball stemmer.

    Parameters
    ----------
    doc : str
        Raw text of one document.

    Returns
    -------
    generator of str
        Stemmed tokens, in the order produced by the analyzer.
    """
    return (_RUSSIAN_STEMMER.stem(w) for w in _WORD_ANALYZER(doc))

# Turn every free-text column into a TF-IDF matrix over stemmed tokens.
for column in text_columns:

    start = time.time()

    # NOTE: stop words are currently not removed; doing so would require
    # lower-casing the text first.

    # The vocabulary is learned from the train and test text together.
    serie_all_data = pd.concat([df_train[column], df_test[column]], copy=False)
    tfidf = TfidfVectorizer(
        analyzer=stemmed_words,
        sublinear_tf=True,
        lowercase=True,
        min_df=2,
        max_df=0.8,
        dtype=np.float32,
    ).fit(serie_all_data)
    serie_all_data = None

    # Project each data set onto the shared vocabulary (train first, then test).
    train_column_transformed, test_column_transformed = (
        tfidf.transform(frame[column]) for frame in (df_train, df_test)
    )

    # The raw text column is no longer needed in either frame.
    df_train.drop(columns=[column], inplace=True)
    df_test.drop(columns=[column], inplace=True)

    # Keep the resulting matrices for the final horizontal stack.
    matrizes_train_resultado.append(train_column_transformed)
    matrizes_test_resultado.append(test_column_transformed)

    end = time.time()

    print('Tempo de processamento da coluna ' + column + ' : ' + format(end - start, '.2f') + 's\n')

# Show a sample of the matrix produced for the last processed column.
print("Coluna " + column + " transformada em:")
exibir_dataframe(pd.DataFrame(train_column_transformed[:3].toarray()))

# Release memory
train_column_transformed = test_column_transformed = tfidf = None

Tempo de processamento da coluna title : 0.05s

Tempo de processamento da coluna description : 0.28s

Coluna description transformada em:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285434,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447227,0.0,0.0,0.0,0.447227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447227,0.0,0.0,0.0,0.0,0.0,0.0,0.447227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.34421,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.606255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.79527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.441328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.441328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.441328,0.0,0.0,0.0,0.0,0.234824,0.0,0.0,0.0,0.0,0.0,0.441328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.306665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.26783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Processamento de colunas categóricas

In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing

def dropcols_coo(M, idx_to_drop):
    """Return a CSR copy of sparse matrix ``M`` without the given columns.

    The input matrix is never modified. (Previously a matrix that was already
    in COO format was edited in place, because ``tocoo()`` returns the same
    object for COO input.)

    Parameters
    ----------
    M : scipy.sparse matrix
        Matrix to remove columns from.
    idx_to_drop : int or array-like of int
        Index or indices of the columns to remove. Repeated indices count once.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix with the same rows and dtype as ``M`` and
        ``M.shape[1] - (number of distinct indices)`` columns; the remaining
        columns keep their relative order.

    Raises
    ------
    IndexError
        If any index is negative or not smaller than ``M.shape[1]``.
    """
    idx_to_drop = np.unique(idx_to_drop)  # sorted, duplicates removed
    n_rows, n_cols = M.shape
    if idx_to_drop.size and (idx_to_drop[0] < 0 or idx_to_drop[-1] >= n_cols):
        raise IndexError(
            "column index out of range for a matrix with %d columns" % n_cols
        )

    C = M.tocoo()
    keep = ~np.isin(C.col, idx_to_drop)
    rows = C.row[keep]
    cols = C.col[keep]
    # Shift every surviving column left by the number of dropped columns
    # located before it.
    cols = cols - idx_to_drop.searchsorted(cols)

    # A new matrix is built with an explicit shape instead of overwriting the
    # attributes (including the private ``_shape``) of an existing one.
    trimmed = sps.coo_matrix(
        (C.data[keep], (rows, cols)),
        shape=(n_rows, n_cols - len(idx_to_drop)),
    )
    return trimmed.tocsr()

num_linhas_treino = df_train.shape[0]

for column in categorical_columns:

    start = time.time()

    # Train and test values are encoded together so both data sets end up
    # with the same set of one-hot columns. A plain Series with a fresh index
    # is enough here; no intermediate DataFrame is needed.
    serie_all_data = pd.concat([df_train[column], df_test[column]], ignore_index=True).astype(str)

    # Map every category to an integer label
    label_enc = preprocessing.LabelEncoder()
    column_label_encoded = label_enc.fit_transform(serie_all_data).reshape(-1,1)
    serie_all_data = None

    # One-hot encode the labels. Sparse output is the encoder's default, so
    # the ``sparse`` keyword is not passed: it was renamed ``sparse_output``
    # and is rejected by newer scikit-learn releases.
    one_hot_enc = OneHotEncoder(dtype=np.uint8)
    column_transformed = one_hot_enc.fit_transform(column_label_encoded)

    # Split the stacked result back into its train and test parts
    train_column_transformed = column_transformed[:num_linhas_treino]
    test_column_transformed  = column_transformed[num_linhas_treino:]

    # Remove the last one-hot column, which is linearly dependent on the
    # others. Both parts have the same width, so the index is computed once.
    indice_ultima_coluna = column_transformed.shape[1]-1
    column_transformed = None
    train_column_transformed = dropcols_coo(train_column_transformed, indice_ultima_coluna)
    test_column_transformed = dropcols_coo(test_column_transformed, indice_ultima_coluna)

    # The raw categorical column is no longer needed in either frame
    df_train.drop(columns=[column], inplace=True)
    df_test.drop(columns=[column], inplace=True)

    # Keep the resulting matrices for the final horizontal stack
    matrizes_train_resultado.append(train_column_transformed)
    matrizes_test_resultado.append(test_column_transformed)

    end = time.time()

    print('Tempo de processamento da coluna '+column+' : '+str("%.2f" % (end - start))+'s\n')

# Show a sample of the matrix produced for the last processed column
print("Coluna "+column+ " transformada em:")
exibir_dataframe(pd.DataFrame(train_column_transformed[:3].toarray()))

# Release memory
train_column_transformed = None
test_column_transformed  = None

Tempo de processamento da coluna user_type : 0.00s

Tempo de processamento da coluna parent_category_name : 0.00s

Tempo de processamento da coluna category_name : 0.02s

Tempo de processamento da coluna user_id : 0.00s

Tempo de processamento da coluna param_1 : 0.00s

Tempo de processamento da coluna param_2 : 0.02s

Tempo de processamento da coluna param_3 : 0.01s

Coluna param_3 transformada em:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


# Normalização dos dados

In [11]:
from sklearn.preprocessing import MinMaxScaler

start = time.time()

# The scaling range of every numeric column is learned from train and test
# values together.
df_all_data = pd.concat(
    [df_train[numeric_columns], df_test[numeric_columns]],
    copy=False,
)

# Min-max scaling of the numeric columns
scaler = MinMaxScaler(copy=False)
scaler.fit(df_all_data[numeric_columns])
train_matriz_scaled, test_matriz_scaled = (
    scaler.transform(frame[numeric_columns]) for frame in (df_train, df_test)
)

# Keep the scaled matrices for the final horizontal stack
matrizes_train_resultado.append(train_matriz_scaled)
matrizes_test_resultado.append(test_matriz_scaled)

end = time.time()

print('Tempo de processamento para normalização dos dados : ' + format(end - start, '.2f') + 's\n')

# Show the first three scaled training rows
print("Coluna numéricas transformadas em:")
exibir_dataframe(pd.DataFrame(train_matriz_scaled[:3]))

# Release memory (the test frame is not used after this point)
train_matriz_scaled = test_matriz_scaled = None
df_test = df_all_data = None

Tempo de processamento para normalização dos dados : 0.01s

Coluna numéricas transformadas em:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,0.000173,0.325783,1.7e-05,0.237838,1.0,0.206346,0.0,0.02034,0.368521,0.275627,0.596271,0.23657,0.099998,0.754106,0.494796,1.0,0.866667,1.0,0.0,0.0,1.0
1,0.001304,0.22052,0.000314,0.243243,1.0,0.257937,0.064178,0.0,0.295783,0.332378,0.351269,0.388615,0.329904,0.541984,0.366832,0.893939,0.822222,0.909091,0.0,0.0,0.875
2,0.001739,1.0,0.00014,0.32973,0.54717,0.077478,0.0,0.745471,0.760372,0.745429,0.712902,0.288102,0.135635,0.187852,0.237201,0.712121,0.911111,0.454545,0.0,0.0,0.5


# Criando Matrizes esparsas para o resultado

In [12]:
target_column_name = "deal_probability"

start = time.time()

# Stack the per-column feature matrices side by side (train first, then test)
matriz_train, matriz_test = (
    sps.hstack(blocos)
    for blocos in (matrizes_train_resultado, matrizes_test_resultado)
)

# Target values stored as a single-column sparse matrix
matriz_train_target = sps.csr_matrix(
    np.reshape(df_train[target_column_name].values, (-1, 1)),
    dtype=np.float64,
    copy=True,
)

end = time.time()

# Release memory
df_train = None

print('Tempo de concatenação das matrizes de resultado : ' + format(end - start, '.2f') + 's\n')

matriz_test

Tempo de concatenação das matrizes de resultado : 0.00s



<32x433 sparse matrix of type '<class 'numpy.float64'>'
	with 1141 stored elements in COOrdinate format>

# Salvando dados processados

In [13]:
# Persist the target and the two feature matrices as .npz files in the
# working directory, in the same order as before: target, train, test.
start = time.time()
for nome_arquivo, matriz in (
    ('sparse_matrix_train_target.npz', matriz_train_target),
    ('sparse_matrix_train.npz', matriz_train),
    ('sparse_matrix_test.npz', matriz_test),
):
    sps.save_npz(nome_arquivo, matriz)
end = time.time()

print('Tempo para salvamento dos resultados : ' + format(end - start, '.2f') + 's\n')

Tempo para salvamento dos resultados : 0.02s

