In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [2]:
train = pd.read_csv("credit_train.csv",header=0,encoding='unicode_escape',sep=";")
test =pd.read_csv("credit_test.csv",header=0,encoding='unicode_escape',sep=";")

In [3]:
train.head(5)

Unnamed: 0,client_id,gender,age,marital_status,job_position,credit_sum,credit_month,tariff_id,score_shk,education,living_region,monthly_income,credit_count,overdue_credit_count,open_account_flg
0,1,M,48,MAR,UMN,5999800,10,1.6,770249,GRD,ÊÐÀÑÍÎÄÀÐÑÊÈÉ ÊÐÀÉ,30000.0,1.0,1.0,0
1,2,F,28,MAR,UMN,1088900,6,1.1,248514,GRD,ÌÎÑÊÂÀ,43000.0,2.0,0.0,0
2,3,M,32,MAR,SPC,1072800,12,1.1,459589,SCH,ÎÁË ÑÀÐÀÒÎÂÑÊÀß,23000.0,5.0,0.0,0
3,4,F,27,DIV,SPC,1200909,12,1.1,362536,GRD,ÎÁË ÂÎËÃÎÃÐÀÄÑÊÀß,17000.0,2.0,0.0,0
4,5,M,45,MAR,SPC,1690889,10,1.1,421385,SCH,×ÅËßÁÈÍÑÊÀß ÎÁËÀÑÒÜ,25000.0,1.0,0.0,0


In [4]:
test.head(5)

Unnamed: 0,client_id,gender,age,marital_status,job_position,credit_sum,credit_month,tariff_id,score_shk,education,living_region,monthly_income,credit_count,overdue_credit_count
0,170747,F,48,MAR,UMN,1655800,10,1.1,370409,GRD,ÕÀÊÀÑÈß ÐÅÑÏ,34000,2.0,0.0
1,170748,M,29,MAR,SPC,702872,6,1.4,377281,SCH,ÎÁË ÊÅÌÅÐÎÂÑÊÀß,19000,3.0,0.0
2,170749,F,20,UNM,SPC,1301900,10,1.6,538515,SCH,ÎÁË ÁÅËÃÎÐÎÄÑÊÀß,25000,1.0,0.0
3,170750,F,41,MAR,SPC,937900,10,1.1,482520,GRD,ÎÁË ÈÐÊÓÒÑÊÀß,30000,0.0,0.0
4,170751,F,31,MAR,SPC,1399000,6,1.43,485914,GRD,ÎÁË ÊÈÐÎÂÑÊÀß,20000,2.0,0.0


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170746 entries, 0 to 170745
Data columns (total 15 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   client_id             170746 non-null  int64  
 1   gender                170746 non-null  object 
 2   age                   170746 non-null  int64  
 3   marital_status        170746 non-null  object 
 4   job_position          170746 non-null  object 
 5   credit_sum            170746 non-null  object 
 6   credit_month          170746 non-null  int64  
 7   tariff_id             170746 non-null  float64
 8   score_shk             170746 non-null  object 
 9   education             170746 non-null  object 
 10  living_region         170554 non-null  object 
 11  monthly_income        170745 non-null  float64
 12  credit_count          161516 non-null  float64
 13  overdue_credit_count  161516 non-null  float64
 14  open_account_flg      170746 non-null  int64  
dtype

In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91940 entries, 0 to 91939
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   client_id             91940 non-null  int64  
 1   gender                91940 non-null  object 
 2   age                   91940 non-null  int64  
 3   marital_status        91940 non-null  object 
 4   job_position          91940 non-null  object 
 5   credit_sum            91940 non-null  object 
 6   credit_month          91940 non-null  int64  
 7   tariff_id             91940 non-null  float64
 8   score_shk             91940 non-null  object 
 9   education             91940 non-null  object 
 10  living_region         91824 non-null  object 
 11  monthly_income        91940 non-null  int64  
 12  credit_count          87237 non-null  float64
 13  overdue_credit_count  87237 non-null  float64
dtypes: float64(3), int64(4), object(7)
memory usage: 9.8+ MB


1) Cleaning the data:
First thing to notice is that "credit_sum" and "score_shk" both compromise of "object" values,therefore we need to change "," to ".",to make them float. Also there some missing values we need to fill in parameters:"living region","monthly income","credit_count" and "overdue_credit_count".

In [7]:
train["credit_sum"]=train["credit_sum"].str.replace(",",".")
train["score_shk"]=train["score_shk"].str.replace(",",".")
test["credit_sum"]=test["credit_sum"].str.replace(",",".")
test["score_shk"]=test["score_shk"].str.replace(",",".")

In [8]:
train["credit_sum"]=train["credit_sum"].astype('float')
train["score_shk"]=(train["score_shk"]).astype('float')
test["credit_sum"]=test["credit_sum"].astype('float')
test["score_shk"]=test["score_shk"].astype('float')

Encoding categorical values into numeric

In [9]:
# Identify most relevant features
# after encoding categorical data below, i checked that "living_region" has the lowest correclation with our target
#I decided not to add it to our model, since it will significatlly slow it down

relevant_features=["gender","age","marital_status","job_position","credit_sum","credit_month","tariff_id","score_shk","education","monthly_income","credit_count","overdue_credit_count"]
# Handle missing values
imputer = SimpleImputer(strategy='most_frequent')
train[relevant_features] = imputer.fit_transform(train[relevant_features])
test[relevant_features] = imputer.transform(test[relevant_features])


In [10]:
train['gender'] = train['gender'].map({'M': 0, 'F': 1})
test['gender'] = test['gender'].map({'M': 0, 'F': 1})

train["marital_status"] = train["marital_status"].map({'MAR':0, 'DIV':1, 'UNM':2, 'WID':3, 'CIV':4})
test["marital_status"] = test["marital_status"].map({'MAR':0, 'DIV':1, 'UNM':2, 'WID':3, 'CIV':4})

train['job_position'] = train['job_position'].map({'UMN':0, 'SPC':1, 'INP':2, 'DIR':3, 'ATP':4, 'PNA':5, 'BIS':6, 'WOI':7, 'NOR':8,
           'WRK':9, 'WRP':10, 'PNV':11, 'BIU':12, 'PNI':13, 'HSK':14, 'PNS':15, 'INV':16, 'ONB':17})
test['job_position'] = test['job_position'].map({'UMN':0, 'SPC':1, 'INP':2, 'DIR':3, 'ATP':4, 'PNA':5, 'BIS':6, 'WOI':7, 'NOR':8,
       'WRK':9, 'WRP':10, 'PNV':11, 'BIU':12, 'PNI':13, 'HSK':14, 'PNS':15, 'INV':16, 'ONB':17})

train["education"] = train["education"].map({'GRD':0, 'SCH':1, 'UGR':2, 'PGR':3, 'ACD':4})
test["education"] = test["education"].map({'GRD':0, 'SCH':1, 'UGR':2, 'PGR':3, 'ACD':4})

#d={}
#region=train["living_region"].unique()
#for i in range(302):
#    d[region[i]]=i
#train["living_region"]= train["living_region"].map(d)

#b={}
#region2=test["living_region"].unique()
#for i in range(302):
#    b[region[i]]=i
#test["living_region"]= test["living_region"].map(b)


# after looking to correlations I decided not to add "living_rigion" to the "relavent_features", look at the previous cell

In [11]:
# Transform skewed or non-normal features
# Instead of normalizing all of the numeric features, you could try using techniques like log transformation or Box-Cox transformation to make the distribution of a feature more normal
scaler = StandardScaler()
train[relevant_features] = scaler.fit_transform(train[relevant_features])
test[relevant_features] = scaler.transform(test[relevant_features])

In [12]:
X_train = train[relevant_features]
y_train = train['open_account_flg']
X_test = test[relevant_features]

In [13]:
# Split the data into training and cross-validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=33)

In [14]:
log_reg = LogisticRegression()
model = VotingClassifier(estimators=[('lr', log_reg)])
model.fit(X_train, y_train)


param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid_search = GridSearchCV(log_reg, param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_log_reg = grid_search.best_estimator_
print("Best parameters for logistic regression: ", grid_search.best_params_)


model = VotingClassifier(estimators=[('lr', best_log_reg)])
model.fit(X_train, y_train)

Best parameters for logistic regression:  {'C': 0.001}


VotingClassifier(estimators=[('lr', LogisticRegression(C=0.001))])

In [15]:
# Evaluate the fine-tuned model
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Accuracy: ", accuracy)

Accuracy:  0.8256515373352855


In [16]:
# Evaluate the logistic regression classifier
scores = cross_val_score(log_reg, X_train, y_train, cv=5)
print("Accuracy of logistic regression classifier: ", scores.mean())

Accuracy of logistic regression classifier:  0.8233989309753207


In [17]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [18]:
# Save the predictions to a CSV file
output = pd.DataFrame({'client_id': test['client_id'], 'open_account_flg': y_pred})
output.to_csv('submission.csv', index=False)