In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.nonparametric import kaplan_meier_estimator
from sksurv.preprocessing import OneHotEncoder
from sksurv.metrics import concordance_index_censored
from google.oauth2 import service_account
import pandas_gbq as gbq


In [2]:
# df = pd.read_csv('dataset/X_Y_merged.csv')
# df.head()
credentials_path = 'token.json'

# Authenticate with your credentials
credentials = service_account.Credentials.from_service_account_file(
    credentials_path, scopes=['https://www.googleapis.com/auth/bigquery'])

# Set the credentials for pandas_gbq
gbq.context.credentials = credentials

project_id = 'capstone-398012'
dataset_id = 'capstone_final'
table_id = "CRI_Compustat_Merged_785k"

from pandas_gbq import read_gbq

query = f"""
SELECT *
FROM `{project_id}.{dataset_id}.{table_id}`

"""

# Authenticate and read data from BigQuery into a DataFrame
df = read_gbq(query, project_id=project_id, dialect='standard')
df.head()


Downloading: 100%|[32m██████████[0m|


Unnamed: 0,CompanyNumber,yyyy,mm,DTDmedianFin,DTDmedianNonFin,dummy297fin,EventDate,EventDate_string,Duration,StartDate,...,equity_ratio,financial_leverage_ratio,cashflow_to_debt_ratio,net_profit_margin,asset_turnover,receivables_turnover,day_sales_outstanding,working_capital_turnover,price_to_earnings,retention_ratio
0,26978,2000.0,9,0.0,2.197684,0.0,2000-09-01 00:00:00+00:00,2000 09,4624 days 00:00:00,1988-01-04 00:00:00+00:00,...,0.787597,1.269685,0.199624,0.301955,0.364522,1.417719,0.705358,2.874451,13.663176,21.594346
1,27012,2000.0,7,0.0,2.190358,0.0,2000-07-01 00:00:00+00:00,2000 07,3559 days 00:00:00,1990-10-03 00:00:00+00:00,...,1.11911,0.893567,0.115897,0.534346,0.552748,1.227325,0.867594,2.221425,67.372881,16.676442
2,27029,2000.0,10,0.0,2.054456,0.0,2000-10-01 00:00:00+00:00,2000 10,4654 days 00:00:00,1988-01-04 00:00:00+00:00,...,1.558682,0.641568,0.202217,0.416697,0.531928,1.848647,0.540936,1.808307,16.601667,15.683041
3,27088,2000.0,11,0.0,1.885232,0.0,2000-11-01 00:00:00+00:00,2000 11,3502 days 00:00:00,1991-04-01 00:00:00+00:00,...,5.154506,0.194005,0.227931,0.318447,0.344712,3.410596,0.293204,-20.6,19.843478,29.95
4,27123,2000.0,10,0.0,2.054456,0.0,2000-10-01 00:00:00+00:00,2000 10,4654 days 00:00:00,1988-01-04 00:00:00+00:00,...,1.145114,0.873276,0.080033,0.618348,0.523327,0.644906,1.251853,1.730681,216.86747,13.115052


In [32]:
# df['transformed_status'] = df['EventType'].map(lambda x : 1 if x == 1 else 0)
# df['transformed_status'] = df['EventType']

# Data preprocessing
df['transformed_year'] = df['yyyy'].apply(lambda x: x - 2000)
df['transformed_status'] = df['EventType'].map(lambda x : True if x == 0.0 else False)

X = df.drop(['yyyy','mm','EventDate', 'EventDate_string', 'Duration', 'StartDate', 'EventType', 'datadate', 'transformed_year', 'transformed_status'], axis=1)  # Features

# X = df.drop(['EventDate_notNA','EventType', 'Sector_Number_notNA', 'Exchange_notNA'], axis=1)  # Features
time_column = df['transformed_year'].values
event_column = df['transformed_status'].values
y = np.array(list(zip(event_column,time_column)), dtype=[('transformed_status', bool), ('transformed_year', int)])

df.head()

Unnamed: 0,CompanyNumber,yyyy,mm,DTDmedianFin,DTDmedianNonFin,dummy297fin,EventDate,EventDate_string,Duration,StartDate,...,net_profit_margin,asset_turnover,receivables_turnover,day_sales_outstanding,working_capital_turnover,price_to_earnings,retention_ratio,transformed_year,status_boolean,transformed_status
0,26978,2000.0,9,0.0,2.197684,0.0,2000-09-01 00:00:00+00:00,2000 09,4624 days 00:00:00,1988-01-04 00:00:00+00:00,...,0.301955,0.364522,1.417719,0.705358,2.874451,13.663176,21.594346,0.0,False,True
1,27012,2000.0,7,0.0,2.190358,0.0,2000-07-01 00:00:00+00:00,2000 07,3559 days 00:00:00,1990-10-03 00:00:00+00:00,...,0.534346,0.552748,1.227325,0.867594,2.221425,67.372881,16.676442,0.0,False,True
2,27029,2000.0,10,0.0,2.054456,0.0,2000-10-01 00:00:00+00:00,2000 10,4654 days 00:00:00,1988-01-04 00:00:00+00:00,...,0.416697,0.531928,1.848647,0.540936,1.808307,16.601667,15.683041,0.0,False,True
3,27088,2000.0,11,0.0,1.885232,0.0,2000-11-01 00:00:00+00:00,2000 11,3502 days 00:00:00,1991-04-01 00:00:00+00:00,...,0.318447,0.344712,3.410596,0.293204,-20.6,19.843478,29.95,0.0,False,True
4,27123,2000.0,10,0.0,2.054456,0.0,2000-10-01 00:00:00+00:00,2000 10,4654 days 00:00:00,1988-01-04 00:00:00+00:00,...,0.618348,0.523327,0.644906,1.251853,1.730681,216.86747,13.115052,0.0,False,True


In [33]:
# Data preprocessing
# X = df.drop(['yyyy', 'EventDate_notNA','Event_type', 'Sector_Number_notNA', 'Exchange_notNA', 'transformed_status','transformed_year', 'status_boolean'], axis=1)  # Features
#df_new = df.copy()
# df_new.dropna(inplace=True)

#X = df_new.drop(['EventDate','EventType','transformed_year', 'status_boolean', 'yyyy', 'mm'], axis=1)  # Features
#y = np.array(list(zip(df_new['status_boolean'], df_new['transformed_year'])), dtype=[('event', bool), ('time', int)])

# encoder = OneHotEncoder()
# X_encoded = encoder.fit_transform(X)

# Imputing Nan
# X_encoded = X_encoded.fillna(0)

# Scaling
# scaler = StandardScaler()
# X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
print(y_train.dtype)


[('transformed_status', '?'), ('transformed_year', '<i4')]


In [36]:
# Survival model - Cox Proportional Hazards (CoxPH) model
estimator = CoxPHSurvivalAnalysis()
estimator.fit(X_train, y_train)


  delta = solve(
  delta = solve(
  delta = solve(
  delta = solve(
  delta = solve(
  delta = solve(
  delta = solve(
  delta = solve(
  delta = solve(
  delta = solve(
  delta = solve(
  delta = solve(
  delta = solve(
  delta = solve(


In [38]:
prediction = estimator.predict(X_test)

c_index = estimator.score(X_test, y_test)
print(f"Concordance Index (C-index): {c_index:.4f}")

  concordant += n_con


Concordance Index (C-index): 0.6647
