<a href="https://colab.research.google.com/github/susan291-gifs/SussieAssignment/blob/main/Credit_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
import xgboost as xgb

### Problem 1


*Home Credit strives to broaden financial inclusion for the unbanked population by providing a positive and safe borrowing experience. In order to make sure this underserved population has a positive loan experience, Home Credit makes use of a variety of alternative data--including telco and transactional information--to predict their clients' repayment abilities.*

While Home Credit is currently using various statistical and machine learning methods to make these predictions, they're challenging Kagglers to help them unlock the full potential of their data. Doing so will ensure that clients capable of repayment are not rejected and that loans are given with a principal, maturity, and repayment calendar that will empower their clients to be successful.


### Problem 2

### Load Data

In [None]:
# Read the training and test application tables in one pass.
train, test = map(
    pd.read_csv,
    (
        '/content/drive/MyDrive/Colab Notebooks/application_train.csv',
        '/content/drive/MyDrive/Colab Notebooks/application_test.csv',
    ),
)

# Last expression of the cell: preview the first rows of the training table.
train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


### Processing Data

In [None]:

# Per-column count of missing values, largest first, and the percentage of
# rows each count represents.
missing_values = train.isnull().sum().sort_values(ascending=False)
missing_percentage = (missing_values / len(train)) * 100
missing_data = pd.DataFrame({'Missing Values': missing_values, 'Percentage': missing_percentage})

# A notebook cell only renders its final expression, so the intermediate
# tables are shown explicitly with display() instead of being discarded.
display(missing_data.head(20))
display(train.describe())

# Share of each TARGET class (rendered as the cell's output).
train['TARGET'].value_counts(normalize=True)


TARGET
0    0.919271
1    0.080729
Name: proportion, dtype: float64

### Problem 4

In [None]:
# Re-read both application tables so this cell starts from the raw CSVs.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Colab Notebooks/application_train.csv',
        '/content/drive/MyDrive/Colab Notebooks/application_test.csv',
    )
)

def preprocess_data(df, target=False):
    """Return ``df`` with missing numeric values replaced by column medians.

    Note: a later cell in this notebook defines another function with the
    same name, which replaces this one once that cell has run.

    Args:
        df: Input DataFrame. It is not modified; a filled copy is returned.
        target: Not used by this implementation; kept so the signature
            stays unchanged.

    Returns:
        A new DataFrame in which NaNs in numeric columns are filled with
        that column's median. Non-numeric columns are left as they are.
    """
    # numeric_only=True restricts the medians to numeric columns, so frames
    # that also contain string columns can be processed. The result is
    # returned explicitly; without a return the function would yield None.
    return df.fillna(df.median(numeric_only=True))


# Encode the target as consecutive integer class labels.
le = LabelEncoder()
train['TARGET'] = le.fit_transform(train['TARGET'])

# Features are every column except the label and the applicant id.
X = train.drop(columns=['TARGET', 'SK_ID_CURR'])
y = train['TARGET']

# Hold out 20% of the rows for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode the categorical columns of each split independently ...
X_train_encoded = pd.get_dummies(X_train)
X_val_encoded = pd.get_dummies(X_val)
test_encoded = pd.get_dummies(test.drop(columns=['SK_ID_CURR']))

# ... then force validation and test onto the training columns: dummy columns
# missing from a split are added as 0, and columns unseen in training are dropped.
X_train_encoded, X_val_encoded = X_train_encoded.align(X_val_encoded, join='left', axis=1, fill_value=0)
X_train_encoded, test_encoded = X_train_encoded.align(test_encoded, join='left', axis=1, fill_value=0)

# Mean-impute the remaining NaNs; the means are learned on the training split only.
imputer = SimpleImputer(strategy='mean')
X_train_encoded = imputer.fit_transform(X_train_encoded)
X_val_encoded = imputer.transform(X_val_encoded)
test_encoded = imputer.transform(test_encoded)

# Fixed seed so the baseline score is reproducible across runs, matching the
# random_state=42 used for the train/validation split.
model = RandomForestClassifier(random_state=42)
model.fit(X_train_encoded, y_train)

# Column 1 of predict_proba is the probability of the positive class.
y_pred_proba = model.predict_proba(X_val_encoded)[:, 1]

auc_roc = roc_auc_score(y_val, y_pred_proba)
print(f'AUC-ROC: {auc_roc}')

# Score the test set and write the id/probability pairs as the submission file.
test['TARGET'] = model.predict_proba(test_encoded)[:, 1]

submission = test[['SK_ID_CURR', 'TARGET']]
submission.to_csv('baseline_submission.csv', index=False)

AUC-ROC: 0.7145717756559136


### Problem 5

In [None]:
def preprocess_data(df, target=False):
    """Label-encode, median-impute and standardise a DataFrame.

    Steps, in order:
      1. Every object column is cast to ``str`` and label-encoded to integers.
      2. Remaining missing values are filled with the column median.
      3. Each numeric column is standardised independently with
         ``StandardScaler`` (fitted on the data passed in).

    Args:
        df: Input DataFrame. It is copied first, so the caller's frame is
            left unchanged.
        target: When True, a column named ``'TARGET'`` is excluded from
            scaling. When False, every numeric column is scaled.

    Returns:
        The transformed copy of ``df``.
    """
    # Work on a copy: the encoding loop assigns columns directly, which would
    # otherwise overwrite the object columns of the frame the caller passed in.
    df = df.copy()

    le = LabelEncoder()
    for column in df.select_dtypes(include=['object']).columns:
        # After astype(str) a missing entry is an ordinary string, so it
        # receives its own integer label rather than staying NaN.
        df[column] = le.fit_transform(df[column].astype(str))

    df = df.fillna(df.median(numeric_only=True))

    scaler = StandardScaler()
    for column in df.select_dtypes(include=[np.number]).columns:
        if not target or column != 'TARGET':
            # The scaler expects a 2-D input; flatten its (n, 1) output
            # before storing it back as a column.
            df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()

    return df


# Start again from the raw CSV files.
train, test = map(
    pd.read_csv,
    (
        '/content/drive/MyDrive/Colab Notebooks/application_train.csv',
        '/content/drive/MyDrive/Colab Notebooks/application_test.csv',
    ),
)

# Encode / impute / scale the training table, leaving TARGET unscaled.
train = preprocess_data(train, target=True)

# Label vector, and the feature matrix without the label and id columns.
y = train['TARGET']
X = train.drop(columns=['TARGET', 'SK_ID_CURR'])

# 80/20 split that keeps the class proportions of y in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### Add Domain Knowledge Features

In [None]:
def add_domain_features(df):
    """Add four ratio features to ``df`` in place and return it.

    New columns (numerator / denominator):
      * ``CREDIT_INCOME_RATIO``  = AMT_CREDIT / AMT_INCOME_TOTAL
      * ``ANNUITY_INCOME_RATIO`` = AMT_ANNUITY / AMT_INCOME_TOTAL
      * ``CREDIT_TERM``          = AMT_CREDIT / AMT_ANNUITY
      * ``DAYS_EMPLOYED_RATIO``  = DAYS_EMPLOYED / DAYS_BIRTH

    Args:
        df: DataFrame containing the source columns above. It is modified
            in place; pass a copy to keep the original unchanged.

    Returns:
        The same DataFrame object with the ratio columns added.
    """
    ratio_specs = {
        'CREDIT_INCOME_RATIO': ('AMT_CREDIT', 'AMT_INCOME_TOTAL'),
        'ANNUITY_INCOME_RATIO': ('AMT_ANNUITY', 'AMT_INCOME_TOTAL'),
        'CREDIT_TERM': ('AMT_CREDIT', 'AMT_ANNUITY'),
        'DAYS_EMPLOYED_RATIO': ('DAYS_EMPLOYED', 'DAYS_BIRTH'),
    }
    for feature, (numerator, denominator) in ratio_specs.items():
        ratio = df[numerator] / df[denominator]
        # A zero denominator produces +/-inf; store NaN instead so the ratio
        # is handled as a missing value rather than an infinite one.
        df[feature] = ratio.replace([np.inf, -np.inf], np.nan)
    return df


# Add the ratio features to copies, so X_train / X_val themselves are untouched.
X_train_domain, X_val_domain = (
    add_domain_features(split.copy()) for split in (X_train, X_val)
)

# Boosted-tree classifier; fit on the training split and score the
# positive-class probabilities on the validation split.
model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)
model.fit(X_train_domain, y_train)
y_val_pred = model.predict_proba(X_val_domain)[:, 1]
auc_roc = roc_auc_score(y_val, y_val_pred)
print(f'AUC-ROC with domain features: {auc_roc}')


AUC-ROC with domain features: 0.7585160665320159


### Aggregation Features

In [None]:
def add_aggregation_features(df):
    """Add row-wise sum, mean and standard deviation of the EXT_SOURCE columns.

    The statistics are taken across ``EXT_SOURCE_1``, ``EXT_SOURCE_2`` and
    ``EXT_SOURCE_3`` for each row and stored as ``SUM_EXT_SOURCE``,
    ``MEAN_EXT_SOURCE`` and ``STD_EXT_SOURCE``.

    Args:
        df: DataFrame containing the three EXT_SOURCE columns. It is
            modified in place.

    Returns:
        The same DataFrame object with the three new columns added.
    """
    # Select the source columns once; every statistic reads this same slice.
    ext_sources = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']]
    reducers = (
        ('SUM_EXT_SOURCE', 'sum'),
        ('MEAN_EXT_SOURCE', 'mean'),
        ('STD_EXT_SOURCE', 'std'),
    )
    for feature, reducer in reducers:
        df[feature] = getattr(ext_sources, reducer)(axis=1)
    return df


# Build the EXT_SOURCE aggregates with add_aggregation_features (not
# add_domain_features), so this score reflects the aggregation features
# rather than repeating the domain-feature experiment.
X_train_agg = add_aggregation_features(X_train.copy())
X_val_agg = add_aggregation_features(X_val.copy())

# Refit the existing `model` on the aggregation feature set and score the
# positive-class probabilities on the validation split.
model.fit(X_train_agg, y_train)
y_val_pred = model.predict_proba(X_val_agg)[:, 1]
auc_roc = roc_auc_score(y_val, y_val_pred)
print(f'AUC-ROC with aggregation features: {auc_roc}')

AUC-ROC with aggregation features: 0.7585160665320159


### Interaction Features

In [None]:
def add_interaction_features(df):
    """Add pairwise products of the three EXT_SOURCE columns.

    For each pair (1, 2), (1, 3) and (2, 3) a column ``EXT_SOURCE_<a>_<b>``
    is created holding ``EXT_SOURCE_<a> * EXT_SOURCE_<b>``.

    Args:
        df: DataFrame containing ``EXT_SOURCE_1``, ``EXT_SOURCE_2`` and
            ``EXT_SOURCE_3``. It is modified in place.

    Returns:
        The same DataFrame object with the three product columns added.
    """
    pairs = (
        ('EXT_SOURCE_1_2', 'EXT_SOURCE_1', 'EXT_SOURCE_2'),
        ('EXT_SOURCE_1_3', 'EXT_SOURCE_1', 'EXT_SOURCE_3'),
        ('EXT_SOURCE_2_3', 'EXT_SOURCE_2', 'EXT_SOURCE_3'),
    )
    for product_name, left, right in pairs:
        df[product_name] = df[left] * df[right]
    return df


# Add the pairwise EXT_SOURCE products to copies of both splits.
X_train_interact, X_val_interact = (
    add_interaction_features(split.copy()) for split in (X_train, X_val)
)

# Refit the existing `model` on the interaction feature set, then score the
# positive-class probabilities on the validation split.
model.fit(X_train_interact, y_train)
y_val_pred = model.predict_proba(X_val_interact)[:, 1]
auc_roc = roc_auc_score(y_val, y_val_pred)
print(f'AUC-ROC with interaction features: {auc_roc}')


AUC-ROC with interaction features: 0.7567947812693026
