<a href="https://colab.research.google.com/github/wonjae124/python/blob/main/Adoption_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/gdrive/ (requires the Colab runtime); the
# data-loading cells below read their CSV files from under this mount point.
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [2]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as torch_optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
from datetime import datetime

In [16]:
# Load the labelled training set from the mounted Drive folder, report its
# (rows, columns) shape and preview the first rows.
train = pd.read_csv('/content/gdrive/MyDrive/animal/train.csv.gz')
print("Shape: ", train.shape)
train.head()

Shape:  (26729, 10)


Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [17]:
# Load the unlabelled test set from the mounted Drive folder, report its
# (rows, columns) shape and preview the first rows.
test = pd.read_csv('/content/gdrive/MyDrive/animal/test.csv.gz')
print("Shape: ", test.shape)
test.head()

Shape:  (11456, 8)


Unnamed: 0,ID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,1,Summer,2015-10-12 12:15:00,Dog,Intact Female,10 months,Labrador Retriever Mix,Red/White
1,2,Cheyenne,2014-07-26 17:59:00,Dog,Spayed Female,2 years,German Shepherd/Siberian Husky,Black/Tan
2,3,Gus,2016-01-13 12:20:00,Cat,Neutered Male,1 year,Domestic Shorthair Mix,Brown Tabby
3,4,Pongo,2013-12-28 18:12:00,Dog,Intact Male,4 months,Collie Smooth Mix,Tricolor
4,5,Skooter,2015-09-24 17:59:00,Dog,Neutered Male,2 years,Miniature Poodle Mix,White


In [18]:
# Load the sample submission file to see the expected output layout
# (an ID column plus one column per outcome class).
sample = pd.read_csv('/content/gdrive/MyDrive/animal/sample_submission.csv.gz')
print("Shape: ",sample.shape)
sample.head()

Shape:  (11456, 6)


Unnamed: 0,ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer
0,1,1,0,0,0,0
1,2,1,0,0,0,0
2,3,1,0,0,0,0
3,4,1,0,0,0,0
4,5,1,0,0,0,0


In [19]:
Counter(train['OutcomeType'])

Counter({'Adoption': 10769,
         'Died': 197,
         'Euthanasia': 1555,
         'Return_to_owner': 4786,
         'Transfer': 9422})

In [20]:
Counter(train['Name']).most_common(5)

[(nan, 7691), ('Max', 136), ('Bella', 135), ('Charlie', 107), ('Daisy', 106)]

# Data Preprocessing
The OutcomeSubtype column is not present in the test set, so it cannot be used as a feature and we drop it. AnimalID is unique for every row, so it does not help in training and is dropped as well.

In [22]:
# Target labels for the training rows.
Y = train['OutcomeType']
# Training features: everything except the target, its subtype and the
# per-row unique AnimalID (the columns to discard).
train_X = train.drop(columns=['OutcomeType', 'OutcomeSubtype', 'AnimalID'])
# Test features; this is the same object as `test`, not a copy.
test_X = test

In [24]:
test_X.head()

Unnamed: 0,ID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,1,Summer,2015-10-12 12:15:00,Dog,Intact Female,10 months,Labrador Retriever Mix,Red/White
1,2,Cheyenne,2014-07-26 17:59:00,Dog,Spayed Female,2 years,German Shepherd/Siberian Husky,Black/Tan
2,3,Gus,2016-01-13 12:20:00,Cat,Neutered Male,1 year,Domestic Shorthair Mix,Brown Tabby
3,4,Pongo,2013-12-28 18:12:00,Dog,Intact Male,4 months,Collie Smooth Mix,Tricolor
4,5,Skooter,2015-09-24 17:59:00,Dog,Neutered Male,2 years,Miniature Poodle Mix,White


In [25]:
train_X.head()

Unnamed: 0,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,Hambone,2014-02-12 18:22:00,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,Emily,2013-10-13 12:44:00,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,Pearce,2015-01-31 12:28:00,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,,2014-07-11 19:09:00,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,,2013-11-15 12:52:00,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


Stacking train and test set so that they undergo the same preprocessing


In [27]:
# Stack the train and test features row-wise so both go through exactly the
# same preprocessing. DataFrame.append was deprecated in pandas 1.4 and removed
# in 2.0; pd.concat is the supported equivalent. Row labels are kept as they
# are (no ignore_index), which matches append's default behaviour, so the
# training rows come first and the test rows follow.
stacked_df = pd.concat([train_X, test_X.drop(columns=['ID'])])

In [28]:
stacked_df.head()

Unnamed: 0,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,Hambone,2014-02-12 18:22:00,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,Emily,2013-10-13 12:44:00,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,Pearce,2015-01-31 12:28:00,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,,2014-07-11 19:09:00,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,,2013-11-15 12:52:00,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


Dropping the DateTime column (splitting it into month and year features is left out here)


In [29]:
# Discard the raw timestamp column entirely; no date features are derived from it.
stacked_df = stacked_df.drop('DateTime', axis='columns')
stacked_df.head()

Unnamed: 0,Name,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,Hambone,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,Emily,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,Pearce,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


dropping columns with too many nulls


In [30]:
# Count missing values per column once, then remove every column that has
# more than 10000 of them, reporting each removal.
null_counts = stacked_df.isnull().sum()
for column_name, n_missing in null_counts.items():
  if n_missing > 10000:
    print("dropping", column_name, n_missing)
    stacked_df = stacked_df.drop(columns=[column_name])

dropping Name 10916


# label encoding


In [32]:
# Replace every column by integer codes. Missing values are filled first so
# that they get a code of their own: the string "NA" for text columns and 0
# for all other columns. A fresh LabelEncoder is fitted for each column.
for column_name in stacked_df.columns:
  is_text_column = stacked_df.dtypes[column_name] == "object"
  filled = stacked_df[column_name].fillna("NA" if is_text_column else 0)
  stacked_df[column_name] = LabelEncoder().fit_transform(filled)

In [35]:
stacked_df.head()

Unnamed: 0,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,1,3,5,1482,146
1,0,4,5,775,184
2,1,3,21,1293,97
3,0,1,26,775,47
4,1,3,21,1101,311


In [36]:
# Convert every column to pandas' categorical dtype in one call; the
# conversion is applied column by column, so each column keeps its own
# set of categories.
stacked_df = stacked_df.astype('category')

splitting back train and test


In [38]:
# The training rows were stacked first, so the first len(train) rows of
# stacked_df are the training features and the remaining rows are the test
# features. The boundary is taken from the data instead of a hard-coded row
# count, and .iloc makes the slicing explicitly positional (the stacked index
# contains repeated labels).
n_train = len(train)
X = stacked_df.iloc[:n_train]
test_processed = stacked_df.iloc[n_train:]

# Guard against the stacked frame and the original frames drifting apart.
assert len(X) == len(train) and len(test_processed) == len(test), \
    "stacked_df does not contain exactly the train rows followed by the test rows"

print("train shape: ", X.shape, "original", train.shape)
print("test shape: ", test_processed.shape, "original", test.shape)

train shape:  (26729, 5) original (26729, 10)
test shape:  (11456, 5) original (11456, 8)


Encoding target

In [39]:
# Encode the outcome labels as integer class codes. The encoder is fitted on
# the original label column rather than on Y itself, so re-running this cell
# always yields the same codes and the same label mapping.
target_encoder = LabelEncoder()
Y = target_encoder.fit_transform(train['OutcomeType'])

print(Counter(train['OutcomeType']))
print(Counter(Y))

# Label -> code lookup taken from the fitted encoder, so it covers every class
# (the hand-written mapping omitted 'Transfer').
target_dict = {str(label): code for code, label in enumerate(target_encoder.classes_)}

Counter({'Adoption': 10769, 'Transfer': 9422, 'Return_to_owner': 4786, 'Euthanasia': 1555, 'Died': 197})
Counter({0: 10769, 4: 9422, 3: 4786, 2: 1555, 1: 197})


train-valid split


In [40]:
# Hold out 10% of the labelled rows for validation; random_state=0 makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.10, random_state=0)
X_train.head()

Unnamed: 0,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
6917,1,3,5,1293,146
13225,0,4,33,1515,231
2697,1,4,5,1353,43
21905,1,3,31,245,40
17071,0,4,37,775,156


Choosing columns for embedding


In [41]:

# Dict comprehension: `name: count` builds one key/value pair per column,
# mapping the column name to its number of categories. Columns with at most
# two categories are left out and therefore get no embedding.
embedded_cols = {
    name: len(series.cat.categories)
    for name, series in X.items()
    if len(series.cat.categories) > 2
}
embedded_cols

{'AgeuponOutcome': 46, 'Breed': 1678, 'Color': 411, 'SexuponOutcome': 6}

Determining size of embedding

In [43]:
# One (number of categories, embedding width) pair per embedded column.
# The width is half the number of categories (rounded up), capped at 50.
embedding_sizes = [
    (cardinality, min(50, (cardinality + 1) // 2))
    for cardinality in embedded_cols.values()
]
embedding_sizes

[(6, 3), (46, 23), (1678, 50), (411, 50)]

In [44]:
class ShelterOutcomeDataset(Dataset):
  """Dataset that serves one row as (embedded features, other features, label).

  Args:
    X: DataFrame holding the already-encoded feature columns.
    Y: Labels, one per row of ``X``, in the same row order.
    embedded_col_names: Names of the columns of ``X`` that are served as
      int64 codes (``X1``); every other column is served as float32 (``X2``).
  """

  def __init__(self, X, Y, embedded_col_names):
    # .values.astype(...) already produces fresh arrays, so the input frame
    # is never modified and no explicit copies are needed.
    self.X1 = X.loc[:, embedded_col_names].values.astype(np.int64)  # categorical columns
    # The dtype conversion must actually be called; storing the bound
    # ``astype`` method would make ``self.X2[idx]`` fail.
    self.X2 = X.drop(columns=embedded_col_names).values.astype(np.float32)  # remaining columns
    # Keep the labels as a plain array so that they are indexed by position,
    # like X1 and X2, even if a pandas Series with a shuffled index is passed.
    self.y = np.asarray(Y)

  def __len__(self):
    """Return the number of samples."""
    return len(self.y)

  def __getitem__(self, idx):
    """Return ``(X1[idx], X2[idx], y[idx])`` for the sample at position ``idx``."""
    return self.X1[idx], self.X2[idx], self.y[idx]
