In [248]:
#import all necessary packages

import pandas as pd
import numpy as np
import json
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from sklearn import metrics
import hypertools as hyp
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

In [164]:
# Load the training set (`pets`) and the test set (`pets2`) from the working directory.
pets, pets2 = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
# Keep the test-set PetID column on its own: PetID is dropped from `pets2`
# further down, but it is needed again when the submission file is built.
petID = pets2.PetID

### Create Necessary Functions

#### Create New Column for Pure Bred (Values: 1 if pure bred, 0 if mixed breed)

#### Fetch sentiment analysis values from JSON files and append the values into new columns

In [None]:
def add_pure(dataframe):
    """Add a binary ``pure_bred`` column to ``dataframe`` (modified in place).

    A row is flagged 1 when ``Breed2`` is 0 or when ``Breed1`` equals
    ``Breed2``; every other row is flagged 0.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing ``Breed1`` and ``Breed2`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with ``pure_bred`` added.

    Raises
    ------
    KeyError
        If ``Breed1`` or ``Breed2`` is missing from ``dataframe``.
    """
    # Vectorised comparison over the rows of *this* frame. The row count comes
    # from the argument itself, not from any module-level frame, so the result
    # is correct for inputs of any length and any index.
    no_second_breed = dataframe['Breed2'] == 0
    same_breed = dataframe['Breed1'] == dataframe['Breed2']
    dataframe['pure_bred'] = (no_second_breed | same_breed).astype(int)
    return dataframe
#'../input/test/test_sentiment/'
def add_sentiments(dataframe, path):
    """Add ``sentiment_score`` and ``sentiment_magnitude`` columns in place.

    For every row, the file ``path + PetID + '.json'`` is read and the
    ``score`` and ``magnitude`` values under its ``documentSentiment`` key are
    stored. Rows whose file does not exist keep the default value 0.0.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing a ``PetID`` column of strings.
    path : str
        Prefix prepended to each file name; it is concatenated as-is, so it
        must already end with a path separator.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the two columns added.

    Raises
    ------
    KeyError
        If a JSON file exists but lacks the expected keys.
    """
    scores = []
    magnitudes = []

    # Iterate over the PetID values themselves rather than looking them up by
    # integer label, so the function also works when the frame's index is not
    # the default 0..n-1 range (e.g. after filtering rows).
    for pet_id in dataframe['PetID']:
        filename = path + pet_id + '.json'
        try:
            # JSON text is read as UTF-8 explicitly instead of relying on the
            # platform's default encoding.
            with open(filename, 'r', encoding='utf-8') as f:
                document_sentiment = json.load(f)['documentSentiment']
        except FileNotFoundError:
            # No sentiment file for this pet: keep the neutral default.
            scores.append(0.0)
            magnitudes.append(0.0)
            continue
        scores.append(document_sentiment['score'])
        magnitudes.append(document_sentiment['magnitude'])

    # Plain arrays are assigned positionally, so no index alignment is involved.
    dataframe['sentiment_score'] = np.array(scores, dtype=float)
    dataframe['sentiment_magnitude'] = np.array(magnitudes, dtype=float)
    return dataframe
    
def create_cats(dataframe):
    """Convert every non-numeric-feature column of ``dataframe`` to ``category``.

    The columns named in ``numeric_columns`` below are left untouched; every
    other column is replaced, in place, by its ``category``-typed version.
    Nothing is returned.
    """
    numeric_columns = ('Age', 'FurLength', 'Quantity', 'Fee', 'VideoAmt',
                       'PhotoAmt', 'sentiment_score', 'sentiment_magnitude')
    for column_name in dataframe.columns:
        if column_name in numeric_columns:
            continue
        dataframe[column_name] = dataframe[column_name].astype('category')

def sentiment_to_float(dataframe):
    """Cast the two sentiment columns of ``dataframe`` to float, in place.

    Both ``sentiment_score`` and ``sentiment_magnitude`` must already exist.
    Nothing is returned.
    """
    score_as_float = dataframe.sentiment_score.astype(float)
    dataframe.sentiment_score = score_as_float

    magnitude_as_float = dataframe.sentiment_magnitude.astype(float)
    dataframe.sentiment_magnitude = magnitude_as_float
    
def fillna(dataframe):
    """Fill missing values in ``dataframe`` in place.

    * ``Name``, ``Description``, ``PetID`` and ``RescuerID`` are left untouched.
    * ``Age``, ``Quantity``, ``PhotoAmt``, ``VideoAmt``, ``sentiment_score``,
      ``sentiment_magnitude`` and ``Fee`` are filled with the column mean.
    * Every other column is filled with its most frequent value (the first
      mode). A column with no non-missing values has no mode and is left as is.

    Nothing is returned.
    """
    text_columns = ('Name', 'Description', 'PetID', 'RescuerID')
    mean_filled_columns = ('Age', 'Quantity', 'PhotoAmt', 'VideoAmt',
                           'sentiment_score', 'sentiment_magnitude', 'Fee')

    for name in dataframe.columns:
        if name in text_columns:
            continue

        column = dataframe[name]
        if name in mean_filled_columns:
            fill_value = column.mean()
        else:
            modes = column.mode()
            if modes.empty:
                # All values are missing, so there is nothing to fill with.
                continue
            fill_value = modes.iloc[0]

        # Assign the filled column back to the frame. Calling
        # fillna(inplace=True) on a column selection is not guaranteed to
        # update the parent frame (pandas Copy-on-Write semantics).
        dataframe[name] = column.fillna(fill_value)

def find_weights(dataframe, num_classes=5):
    """Return the relative frequency of each ``AdoptionSpeed`` class.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing an ``AdoptionSpeed`` column.
    num_classes : int, optional
        Number of class labels to report; labels are ``0 .. num_classes - 1``.
        Defaults to 5.

    Returns
    -------
    list of float
        ``result[c]`` is the fraction of rows whose ``AdoptionSpeed`` equals
        ``c``. For an empty frame every fraction is 0.0.
    """
    speeds = list(dataframe.AdoptionSpeed)
    total = len(speeds)
    if total == 0:
        # Avoid dividing by zero: with no rows, every class has frequency 0.
        return [0.0] * num_classes

    return [
        sum(1 for speed in speeds if speed == label) / total
        for label in range(num_classes)
    ]

In [None]:
# Apply the same feature-engineering and cleaning steps to the training and
# test frames; only the sentiment directory differs between the two.
for frame, sentiment_dir in ((pets, '../input/train_sentiment/'),
                             (pets2, '../input/test_sentiment/')):
    add_pure(frame)
    add_sentiments(frame, sentiment_dir)
    sentiment_to_float(frame)
    fillna(frame)
    create_cats(frame)

In [None]:
# fillna() and create_cats() were already applied to both frames in the
# previous cell, so they are not repeated here.

# Identifier and free-text columns that are not used as model features.
TEXT_COLUMNS = ['Name', 'PetID', 'Description', 'RescuerID']

# errors='ignore' keeps this cell re-runnable after the columns are gone.
pets = pets.drop(columns=TEXT_COLUMNS, errors='ignore')
pets = pets.dropna()

pets2 = pets2.drop(columns=TEXT_COLUMNS, errors='ignore')
# Rows are deliberately NOT dropped from the test set: `petID` holds one ID per
# original test row, and every one of them needs a prediction. Removing rows
# here would leave the predictions and the IDs with different lengths.

In [None]:
# Bar chart of how many training rows fall into each AdoptionSpeed class.
sns.countplot(data=pets, x='AdoptionSpeed')

In [None]:
# Per-class means of the numeric features. create_cats() turned most columns
# into categoricals, which cannot be averaged, so restrict to numeric columns.
pets.groupby('AdoptionSpeed').mean(numeric_only=True)

### Perform Feature Selection (Recursive Feature Elimination)

In [None]:
# Split the column names into the target and the candidate predictors.
adopt = ['AdoptionSpeed']
pets_vars = pets.columns.tolist()
X = [column for column in pets_vars if column not in adopt]

# Copies of the predictor and target frames.
# NOTE(review): these are not referenced again in the visible cells — confirm
# whether they are still needed.
os_data_X = pd.DataFrame(data=pets[X], columns=X)
os_data_y = pd.DataFrame(data=pets[adopt], columns=adopt)

# Recursive feature elimination driven by a logistic-regression estimator.
logreg = LogisticRegression()
rfe = RFE(logreg).fit(pets[X], pets[adopt].values.ravel())

In [None]:
# Selection mask first, then the per-feature ranking, one per line.
print(rfe.support_, rfe.ranking_, sep='\n')

In [None]:
# Keep only the features RFE marked as selected. The mask entries are tested
# for truthiness directly; comparing their string form with `is not 'False'`
# checks object identity, which does not filter anything out.
new_X = [str(column) for column, selected in zip(X, rfe.support_) if selected]
new_X

In [None]:
# Target column of the training set (still a one-column DataFrame).
y = pets[adopt]
# Selected features as plain arrays: training set first, then test set.
new_XX, test_X = (np.asarray(frame[new_X]) for frame in (pets, pets2))

In [None]:
# Relative frequency of each AdoptionSpeed class in the training set.
k = find_weights(pets)

In [None]:
# Flat integer array of training labels (y is a one-column DataFrame).
train_labels = np.asarray(y).ravel().astype(int)

# sample_weight is an argument of fit() with one weight per training row; it is
# not a constructor parameter. `k` holds one frequency per class, so each row
# is given the inverse frequency of its own class, which up-weights the rarer
# classes. Every label seen here occurs in `pets`, so k[label] is never zero.
row_weights = np.array([1.0 / k[label] for label in train_labels])

model = XGBClassifier()
model.fit(new_XX, train_labels, sample_weight=row_weights)
pred_y = model.predict(test_X)

# Make sure the predictions are plain integers for the submission file.
pred_y = np.round(pred_y).astype(int)

In [None]:
#for i in range(0,len(y_pred)-1):
#    print(pets3.PetID[i], int(y_pred[i]))

In [None]:
# Pair every test PetID with its predicted class. The predictions are stored
# in `pred_y` by the modelling cell; `y_pred` is not defined in this notebook.
df = pd.DataFrame({'PetID': petID, 'AdoptionSpeed': pred_y})
df.AdoptionSpeed = df.AdoptionSpeed.astype('int32')

In [None]:
# Write the submission file without the DataFrame index column.
df.to_csv('submission.csv', index=False)