# Inteligência Computacional

## Projeto DATA MINING CUP Competition 2013

In [427]:
import pandas as pd

from copy import deepcopy

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve

In [428]:
# Load the training CSV. The path is relative, so it is resolved against the
# current working directory.
data = pd.read_csv('transact_train.csv')

## Tratando Granularidade e Criando Variáveis

### Agrupando Dados
agrupando todos os dataframes separados por sessionNo em um vetor

In [429]:
# Split the log into one DataFrame per sessionNo and collect them in a list.
# Iterating a GroupBy yields (key, sub-frame) pairs; only the sub-frame is
# kept. The sub-frames already carry every column of `data`, so they do not
# need to be wrapped in a new DataFrame, and no loop variable overwrites the
# grouping key.
agg = data.groupby("sessionNo")
splited = [session_rows for _, session_rows in agg]

In [430]:
# Peek at one entry of the list of per-session DataFrames.
splited[2]

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,...,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder,order
8,3,6,5,181477.0,9,29.99,29.99,89.97,1,29.99,...,?,3,1800,475,302,12,45,1,11,y
9,3,6,5,297018.0,11,9.99,29.99,109.95,2,9.99,...,?,3,1800,475,302,12,45,1,11,y
10,3,6,5,310967.0,11,9.99,29.99,109.95,2,9.99,...,completely orderable,3,1800,475,302,12,45,1,11,y
11,3,6,5,324278.0,11,9.99,29.99,109.95,2,9.99,...,completely orderable,3,1800,475,302,12,45,1,11,y
12,3,6,5,341613.0,11,9.99,29.99,109.95,2,9.99,...,completely orderable,3,1800,475,302,12,45,1,11,y


### Criando Variáveis da mudança de granularidade
Essas são as variáveis que criamos buscando recuperar dados perdidos na mudança da granularidade
##### bStep_count
    Quantidade de bStep maiores ou iguais a 3 em cada sessão
##### countLog
    Número de logs de cada sessão
##### modes
    Moda do bStep de cada Sessão.

In [431]:
bStep_count = []
countLog = []
modes = []
for session in splited:
    # The log count uses every row of the session, including rows whose
    # bStep is unknown.
    countLog.append(len(session))

    # Keep only the known bStep values ('?' marks a missing value).
    known_steps = session.loc[session['bStep'] != '?', 'bStep']

    # Mode of the known steps, or '?' when the session has none.
    modes.append(known_steps.mode()[0] if len(known_steps) > 0 else '?')

    # Number of known steps that are >= 3.
    bStep_count.append(sum(1 for step in known_steps if int(step) >= 3))

In [432]:
#bStep_count
#countLog
#modes

### Mudando Granularidade

In [433]:
# Change the granularity: collapse the log to one row per sessionNo, keeping
# for each column the value returned by GroupBy.last().
# `data` is rebound here, so the per-row log is no longer reachable through
# this name after the cell runs.
data = data.groupby(['sessionNo'], as_index=False).last()

### Criando Variáveis de Correlação

As técnicas de inteligência artificial pressupõem que as variáveis não têm correlação entre si, apenas relação direta com a variável alvo.

Essas são as variáveis que criamos buscando informar uma correlação entre variáveis
##### diffCounts
    diferença entre cCount e bCount
##### durCount
    Razão entre a duração e o cCount

In [434]:
# diffCounts: element-wise difference cCount - bCount, as a NumPy array.
diffCounts = data['cCount'].sub(data['bCount']).values

# durCount: duration / cCount rounded to 2 decimals. Positive-infinity
# results are flagged with the '?' missing-value marker.
duration_per_count = data['duration'].div(data['cCount']).round(2)
durCount = duration_per_count.replace(float("inf"), "?", regex=False).values
# TODO: try other replacement values for inf later.

In [435]:
#diffCounts

### Atribuindo numeros às classes
Buscando melhorar o processamento, transformamos todos os nomes de classe em números, numa sequência de 0 até o número de classes - 1 para cada classe diferente.

In [436]:
##Ver os valores diferentes de availability
#data.groupby(['availability'], as_index=False).last()

##Ver os valores diferentes de onlineStatus
#data.groupby(['onlineStatus'], as_index=False).last()

##Ver os valores diferentes de order
#data.groupby(['order'], as_index=False).last()

In [437]:
# Label -> integer code tables, applied column by column. Values that are not
# listed (for example the '?' missing marker) are left untouched.
label_codes = {
    'availability': {
        'completely not determinable': 0,
        'completely not orderable': 1,
        'completely orderable': 2,
        'mainly not determinable': 3,
        'mainly not orderable': 4,
        'mainly orderable': 5,
        'mixed': 6,
    },
    'onlineStatus': {'y': 1, 'n': 0},
    'order': {'y': 1, 'n': 0},
}

for column_name, codes in label_codes.items():
    for label, code in codes.items():
        # Same row-mask assignment as before, one label at a time.
        data.loc[data[column_name] == label, column_name] = code

### Inserindo as colunas das variáveis criadas
Com as variáveis criadas, adicionamos ao conjunto de dados com a granularidade já alterada.

In [438]:
# Insert the engineered features into the per-session frame, right before the
# target column `order`.
# The insert position is looked up from the frame instead of being hard-coded,
# so the cell does not depend on the exact number of columns. Duplicate column
# names are not allowed (pandas default), so re-running the cell raises
# instead of silently adding a second copy of each feature.
new_features = {
    "bStep_count": bStep_count,
    "countLog": countLog,
    "modes": modes,
    "diffCounts": diffCounts,
    "durCount": durCount,
}
for feature_name, feature_values in new_features.items():
    data.insert(data.columns.get_loc("order"), feature_name, feature_values)

In [439]:
# Remove the customerNo column in place (axis=1 selects columns).
data.drop('customerNo', inplace=True, axis=1)

In [440]:
# Replace every NaN in the frame with 0. The '?' markers are strings, not NaN,
# so they are not affected by this step.
data = data.fillna(value=0)

### Tratando Missing Values

In [441]:
# Independent copies of the frame, one per missing-value strategy
# (numeric strategy + categorical strategy):
#   data  -> minimum     + mode      data2 -> minimum     + new category
#   data3 -> maximum     + mode      data4 -> maximum     + new category
#   data5 -> median      + mode      data6 -> median      + new category
#   data7 -> row removal + mode      data8 -> row removal + new category
backupData = deepcopy(data)  # snapshot of the frame as it is at this point
data2, data3, data4, data5, data6, data7, data8 = (deepcopy(data) for _ in range(7))

In [442]:
#definindo funções para variáveis numéricas:
def replaceMissingbyMin(data, field):
    """Fill the missing entries of a numeric column with the column minimum.

    The column is converted to a numeric dtype first, so the minimum is
    computed on numbers rather than on strings (a string comparison would
    rank '100.0' below '9.99'). The '?' marker, and any other value that
    cannot be parsed as a number, is treated as missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify in place.
    field : str
        Name of the column to treat.

    Returns
    -------
    None
        `data[field]` is overwritten with the filled numeric column. If no
        value in the column is numeric, the column ends up all-NaN.
    """
    numeric = pd.to_numeric(data[field], errors='coerce')
    data[field] = numeric.fillna(numeric.min())
    
def replaceMissingbyMax(data, field):
    """Fill the missing entries of a numeric column with the column maximum.

    The column is converted to a numeric dtype first, so the maximum is
    computed on numbers rather than on strings (a string comparison would
    rank '9.99' above '1000'). The '?' marker, and any other value that
    cannot be parsed as a number, is treated as missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify in place.
    field : str
        Name of the column to treat.

    Returns
    -------
    None
        `data[field]` is overwritten with the filled numeric column. If no
        value in the column is numeric, the column ends up all-NaN.
    """
    numeric = pd.to_numeric(data[field], errors='coerce')
    data[field] = numeric.fillna(numeric.max())

def replaceMissingbyMedian(data, field):
    """Fill the missing entries of a numeric column with the column median.

    The column is converted to a numeric dtype first, because a median cannot
    be computed reliably on a column that still holds strings. The '?' marker,
    and any other value that cannot be parsed as a number, is treated as
    missing and excluded from the median.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify in place.
    field : str
        Name of the column to treat.

    Returns
    -------
    None
        `data[field]` is overwritten with the filled numeric column. If no
        value in the column is numeric, the column ends up all-NaN.
    """
    numeric = pd.to_numeric(data[field], errors='coerce')
    data[field] = numeric.fillna(numeric.median())
    
def removeMissing(data, field):
    """Drop, in place, the rows whose value in `field` is missing.

    The '?' marker in `field` is first turned into NaN, then only rows that
    are NaN in that column are removed. NaN values in other columns do not
    cause a row to be dropped. The index is not reset afterwards.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify in place.
    field : str
        Name of the column to inspect.

    Returns
    -------
    None
    """
    data[field] = data[field].replace('?', float("NaN"))
    # subset restricts the check to this column.
    data.dropna(axis='index', subset=[field], inplace=True)
    

In [443]:
# Treat the missing values of the numeric columns in every working frame.
varsNum = [
    'cMinPrice', 'cMaxPrice', 'cSumPrice',
    'bMinPrice', 'bMaxPrice', 'bSumPrice',
    'bStep', 'maxVal', 'customerScore', 'accountLifetime', 'payments', 'age',
    'lastOrder', "bStep_count", "modes", "countLog", "diffCounts", "durCount",
]

# Each numeric strategy is paired with the two frames it is applied to.
numeric_strategies = (
    (replaceMissingbyMin, (data, data2)),
    (replaceMissingbyMax, (data3, data4)),
    (replaceMissingbyMedian, (data5, data6)),
    (removeMissing, (data7, data8)),
)

for var in varsNum:
    for treat, frames in numeric_strategies:
        for frame in frames:
            treat(frame, var)

  result = method(y)


In [444]:
#definindo funções para variáveis categóricas:
def replaceMissingbyMode(data, field):
    """Fill the '?' entries of a categorical column with its most common value.

    The mode is computed over the known values only ('?' excluded), so it is
    well defined even when '?' is the most frequent entry. When several values
    tie, the first one returned by `Series.mode()` is used. If the column has
    no known value at all, it is left unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify in place.
    field : str
        Name of the column to treat.

    Returns
    -------
    None
    """
    column = data[field]
    known_modes = column[column != '?'].mode()
    if known_modes.empty:
        # Nothing to learn a replacement from.
        return
    data[field] = column.replace('?', known_modes.iloc[0], regex=False)
    
def replaceMissingbyNew(data, field):
    """Recode the '?' entries of `data[field]` as the new category -1, in place."""
    column = data[field]
    data[field] = column.replace(to_replace='?', value=-1, regex=False)

In [445]:
categories = ['onlineStatus', 'availability', 'address']

# Frames whose categorical gaps are filled with the mode, and frames whose
# gaps become a new category.
mode_frames = (data, data3, data5, data7)
new_category_frames = (data2, data4, data6, data8)

for category in categories:
    for frame in mode_frames:
        replaceMissingbyMode(frame, category)
    for frame in new_category_frames:
        replaceMissingbyNew(frame, category)

In [446]:
# Collect the eight treated frames, one per missing-value strategy.
datas = [data, data2, data3, data4, data5, data6, data7, data8]
# Rebind `data` to a fresh copy of the snapshot taken before the treatment.
# datas[0] still refers to the treated frame.
data = deepcopy(backupData)

In [447]:
# Per-column flag: True when the column holds at least one NaN.
# This inspects `data`, which was just reset to the pre-treatment snapshot,
# not one of the treated frames in `datas`.
data.isnull().any()

sessionNo          False
startHour          False
startWeekday       False
duration           False
cCount             False
cMinPrice          False
cMaxPrice          False
cSumPrice          False
bCount             False
bMinPrice          False
bMaxPrice          False
bSumPrice          False
bStep              False
onlineStatus       False
availability       False
maxVal             False
customerScore      False
accountLifetime    False
payments           False
age                False
address            False
lastOrder          False
bStep_count        False
countLog           False
modes              False
diffCounts         False
durCount           False
order              False
dtype: bool

In [448]:
# Display the last treated frame (built from data8: row removal + new category).
datas[7]

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,...,payments,age,address,lastOrder,bStep_count,countLog,modes,diffCounts,durCount,order
11,12,6,5,1540895.00,21,5.99,59.99,456.77,11,5.99,...,11,86,2,37,9,27,1,10,73375.95,1
19,20,6,5,7647.51,11,49.99,499.99,2199.91,1,49.99,...,11,58,2,61,5,8,4,10,695.23,0
20,21,6,5,152401.00,3,14.99,14.99,44.97,1,14.99,...,34,51,2,23,3,6,1,2,50800.33,1
27,28,6,5,660644.00,23,14.99,33.99,617.77,3,24.99,...,8,31,2,109,3,9,1,20,28723.65,1
31,32,6,5,454355.00,5,139.99,139.99,699.95,1,139.99,...,1,53,2,22,5,13,2,4,90871.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49979,49980,18,7,3883236.00,10,8.95,11.9,71.13,3,8.95,...,11,55,1,62,0,4,1,7,388323.60,0
49986,49987,18,7,336.39,13,19.99,199.99,239.97,1,19.99,...,29,43,2,7,3,5,1,12,25.88,1
49989,49990,18,7,228902.00,8,9.99,19.99,115.92,8,9.99,...,13,41,2,41,3,13,1,0,28612.75,1
49992,49993,18,7,4113213.00,69,9.99,24.99,971.31,15,9.99,...,0,54,2,45,0,44,2,54,59611.78,0


In [449]:
#Finish

In [450]:
# Write the first treated frame (minimum + mode) to a CSV in the working
# directory. to_csv is called with its defaults, so the row index is written
# as an extra leading column.
datas[0].to_csv(r'datas_0.csv');

### Avaliando conjunto de Dados

In [451]:
def avaliadorRandomForrest(data, random_state=42):
    """Train a random forest on `data` and return its ROC curve on a hold-out split.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose last column is the binary target (encoded as 0/1) and
        whose remaining columns are the input features.
    random_state : int, default 42
        Seed used for both the train/test split and the forest, so repeated
        calls on the same frame give the same curve.

    Returns
    -------
    tuple
        The (fpr, tpr, thresholds) arrays returned by
        `sklearn.metrics.roc_curve`.
    """
    # Split the frame into inputs (X) and target (Y). The target is cast to
    # int because a 0/1 column stored with object dtype can be rejected by
    # scikit-learn as an unknown label type.
    Y = data.iloc[:, -1].astype(int)
    X = data.iloc[:, :-1]

    # Hold out 40% of the rows for testing.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.40, random_state=random_state
    )

    # Build and train the model.
    forest = RandomForestClassifier(
        n_estimators=10,
        max_features=5,
        criterion="entropy",
        random_state=random_state,
    )
    forest.fit(X_train, Y_train)

    # roc_curve needs a continuous score to sweep thresholds over. Hard 0/1
    # predictions would collapse the curve to a single operating point.
    # Column 1 of predict_proba is the probability of the second class in
    # forest.classes_, i.e. label 1 for a 0/1 target.
    positive_scores = forest.predict_proba(X_test)[:, 1]
    return roc_curve(Y_test, positive_scores)

In [452]:
# Target: last column of the first treated frame. Features: every other column.
Y = datas[0].iloc[:,-1]
X = datas[0].iloc[:,0:-1]

In [453]:
# Hold out 40% of the rows for testing. The fixed random_state makes the
# split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.40, random_state=42)

In [454]:
# Seed the forest so repeated runs of the notebook train the same model. The
# value matches the random_state already used for the train/test split.
forest = RandomForestClassifier(n_estimators=10, max_features=5, criterion="entropy", random_state=42)

In [455]:
# Per-column flag: True when a training feature column holds at least one NaN.
X_train.isnull().any()

sessionNo          False
startHour          False
startWeekday       False
duration           False
cCount             False
cMinPrice          False
cMaxPrice          False
cSumPrice          False
bCount             False
bMinPrice          False
bMaxPrice          False
bSumPrice          False
bStep              False
onlineStatus       False
availability       False
maxVal             False
customerScore      False
accountLifetime    False
payments           False
age                False
address            False
lastOrder          False
bStep_count        False
countLog           False
modes              False
diffCounts         False
durCount           False
dtype: bool

In [456]:
#X = X_train.drop(columns=['durCount'])
#X_train.isnull().any()

In [457]:
# Train the forest on the training split. The result is bound to `temp`,
# presumably only to keep the cell from echoing the estimator's repr.
temp = forest.fit(X_train, Y_train)