Test 


In [11]:
import pandas as pd
import json
from datetime import datetime, timedelta

In [12]:
data_df = pd.read_csv('data.csv')

In [13]:
data_df.head()

Unnamed: 0,id,application_date,contracts
0,2925210.0,2024-02-12 19:22:46.652000+00:00,
1,2925211.0,2024-02-12 19:24:29.135000+00:00,"[{""contract_id"": 522530, ""bank"": ""003"", ""summa..."
2,2925212.0,2024-02-12 19:24:41.493000+00:00,
3,2925213.0,2024-02-12 19:24:29.135000+00:00,"[{""contract_id"": 522530, ""bank"": ""003"", ""summa..."
4,2925214.0,2024-02-12 19:24:56.857000+00:00,


In [38]:
# Define constants for feature defaults
NO_CLAIMS_VALUE = -3
NO_LOANS_VALUE = -1
excluded_banks = ['LIZ', 'LOM', 'MKO', 'SUG', None]

data_df['application_date'] = pd.to_datetime(data_df['application_date'], errors='coerce')


In [29]:
# Convert 'application_date' to datetime for consistency
data_df['application_date'] = pd.to_datetime(data_df['application_date'], errors='coerce')

# Step 1: Calculate `tot_claim_cnt_l180d`
tot_claim_cnt_l180d = []
for _, row in data_df.iterrows():
    contracts = row['contracts']
    application_date = row['application_date'].replace(tzinfo=None)  # Remove timezone for consistency
    
    total_claims = NO_CLAIMS_VALUE  # Default if no valid claims
    if pd.notna(contracts):
        try:
            contracts_data = json.loads(contracts)
            contracts_data = [c for c in contracts_data if isinstance(c, dict)]
            recent_claims = [
                contract for contract in contracts_data
                if 'claim_id' in contract and pd.notna(contract.get('claim_date')) and
                (application_date - pd.to_datetime(contract['claim_date']).replace(tzinfo=None)).days <= 180
            ]
            if recent_claims:
                total_claims = len(recent_claims)
        except json.JSONDecodeError:
            pass
    tot_claim_cnt_l180d.append(total_claims)

data_df['tot_claim_cnt_l180d'] = tot_claim_cnt_l180d
print("\nData after `tot_claim_cnt_l180d` calculation:")
display(data_df[['id', 'tot_claim_cnt_l180d']].head())


  (application_date - pd.to_datetime(contract['claim_date']).replace(tzinfo=None)).days <= 180



Data after `tot_claim_cnt_l180d` calculation:


Unnamed: 0,id,tot_claim_cnt_l180d
0,2925210.0,-3
1,2925211.0,60
2,2925212.0,-3
3,2925213.0,60
4,2925214.0,-3


In [30]:
# Step 2: Calculate `disb_bank_loan_wo_tbc`
disb_bank_loan_wo_tbc = []
for _, row in data_df.iterrows():
    contracts = row['contracts']
    loan_sum_wo_excluded = NO_LOANS_VALUE  # Default if no valid loans
    
    if pd.notna(contracts):
        try:
            contracts_data = json.loads(contracts)
            contracts_data = [c for c in contracts_data if isinstance(c, dict)]
            valid_loans = [
                contract for contract in contracts_data
                if contract.get('bank') not in excluded_banks and pd.notna(contract.get('contract_date'))
            ]
            if valid_loans:
                loan_sum_wo_excluded = sum(
                    float(contract['loan_summa']) for contract in valid_loans
                    if isinstance(contract.get('loan_summa'), (int, float, str)) and str(contract['loan_summa']).strip().replace('.', '', 1).isdigit()
                )
        except json.JSONDecodeError:
            pass
    disb_bank_loan_wo_tbc.append(loan_sum_wo_excluded)

data_df['disb_bank_loan_wo_tbc'] = disb_bank_loan_wo_tbc
print("\nData after `disb_bank_loan_wo_tbc` calculation:")
display(data_df[['id', 'disb_bank_loan_wo_tbc']].head())


Data after `disb_bank_loan_wo_tbc` calculation:


Unnamed: 0,id,disb_bank_loan_wo_tbc
0,2925210.0,-1.0
1,2925211.0,0.0
2,2925212.0,-1.0
3,2925213.0,0.0
4,2925214.0,-1.0


In [31]:
day_sinlastloan = []
for _, row in data_df.iterrows():
    contracts = row['contracts']
    application_date = row['application_date'].replace(tzinfo=None)  # Remove timezone
    
    days_since_last_loan = NO_LOANS_VALUE  # Default if no valid loans
    if pd.notna(contracts):
        try:
            contracts_data = json.loads(contracts)
            contracts_data = [c for c in contracts_data if isinstance(c, dict)]
            last_loan_date = max(
                (pd.to_datetime(contract['contract_date']).replace(tzinfo=None) for contract in contracts_data
                 if pd.notna(contract.get('contract_date')) and pd.notna(contract.get('summa'))),
                default=None
            )
            if last_loan_date:
                days_since_last_loan = (application_date - last_loan_date).days
        except json.JSONDecodeError:
            pass
    day_sinlastloan.append(days_since_last_loan)

data_df['day_sinlastloan'] = day_sinlastloan
print("\nData after `day_sinlastloan` calculation:")
display(data_df[['id', 'day_sinlastloan']].head())

  (pd.to_datetime(contract['contract_date']).replace(tzinfo=None) for contract in contracts_data



Data after `day_sinlastloan` calculation:


Unnamed: 0,id,day_sinlastloan
0,2925210.0,-1.0
1,2925211.0,427.0
2,2925212.0,-1.0
3,2925213.0,427.0
4,2925214.0,-1.0


In [33]:
output_path = 'contract_features.csv'
data_df.to_csv(output_path, index=False)


In [34]:
data_df.head()

Unnamed: 0,id,application_date,contracts,tot_claim_cnt_l180d,disb_bank_loan_wo_tbc,day_sinlastloan
0,2925210.0,2024-02-12 19:22:46.652000+00:00,,-3,-1.0,-1.0
1,2925211.0,2024-02-12 19:24:29.135000+00:00,"[{""contract_id"": 522530, ""bank"": ""003"", ""summa...",60,0.0,427.0
2,2925212.0,2024-02-12 19:24:41.493000+00:00,,-3,-1.0,-1.0
3,2925213.0,2024-02-12 19:24:29.135000+00:00,"[{""contract_id"": 522530, ""bank"": ""003"", ""summa...",60,0.0,427.0
4,2925214.0,2024-02-12 19:24:56.857000+00:00,,-3,-1.0,-1.0


In [35]:
feature_columns = ['tot_claim_cnt_l180d', 'disb_bank_loan_wo_tbc', 'day_sinlastloan']
feature_summary = data_df[feature_columns].describe()

# Display the summary statistics
feature_summary

Unnamed: 0,tot_claim_cnt_l180d,disb_bank_loan_wo_tbc,day_sinlastloan
count,1000.0,1000.0,762.0
mean,2.048,768478000.0,94.643045
std,14.322389,3061854000.0,288.052833
min,-3.0,-1.0,-202.0
25%,-3.0,-1.0,-1.0
50%,-3.0,-1.0,-1.0
75%,3.0,0.0,-1.0
max,153.0,32700000000.0,1730.0
