In [10]:
import pandas as pd

# Read the raw training data from disk.
df = pd.read_csv("cs-training.csv")

# If the first column is the auto-named 'Unnamed: 0' (presumably an index
# column carried over from the source CSV), discard it.
first_column = df.columns[0]
if first_column == 'Unnamed: 0':
    df = df.drop(columns=first_column)

# Split the rows into the two target classes (1 and 0).
target_column = df['SeriousDlqin2yrs']
df_target_1 = df.loc[target_column == 1]
df_target_0 = df.loc[target_column == 0]

# Draw 50 rows from each class; the fixed seed makes the draw repeatable.
sample_1 = df_target_1.sample(n=50, random_state=42)
sample_0 = df_target_0.sample(n=50, random_state=42)

# Stack the two samples (class 1 first), then shuffle the combined rows so
# the classes are interleaved. frac=1 returns every row in random order.
stacked_samples = pd.concat([sample_1, sample_0])
df_sampled = stacked_samples.sample(frac=1, random_state=42)

# Turn the numeric target into text labels, rename the column to 'Risk',
# and discard the shuffled original row labels.
risk_labels = {1: 'high', 0: 'low'}
df_sampled = (
    df_sampled
    .assign(SeriousDlqin2yrs=lambda frame: frame['SeriousDlqin2yrs'].map(risk_labels))
    .rename(columns={'SeriousDlqin2yrs': 'Risk'})
    .reset_index(drop=True)
)

# Number the rows starting at 1 instead of pandas' default 0.
df_sampled.index = pd.RangeIndex(start=1, stop=len(df_sampled) + 1)

# --- Column names and order ---
# Short display names for the verbose source columns. The entries are listed
# in the order the columns should appear in the output (after 'Risk'), so
# this one mapping drives both the rename and the reorder below.
column_labels = {
    'RevolvingUtilizationOfUnsecuredLines': 'Revolving Utilization',
    'age': 'Age',
    'NumberOfTime30-59DaysPastDueNotWorse': '30-59 Days Past Due',
    'DebtRatio': 'Debt Ratio',
    'MonthlyIncome': 'Monthly Income',
    'NumberOfOpenCreditLinesAndLoans': 'Open Credit Lines',
    'NumberOfTimes90DaysLate': '90 Days Late',
    'NumberRealEstateLoansOrLines': 'Real Estate Loans',
    'NumberOfTime60-89DaysPastDueNotWorse': '60-89 Days Past Due',
    'NumberOfDependents': 'Dependents',
}
df_sampled = df_sampled.rename(columns=column_labels)

# Keep exactly these columns, 'Risk' first; a missing column raises KeyError.
df_sampled = df_sampled[['Risk', *column_labels.values()]]

# Report the data columns that will appear in the CSV header after the
# leading index column.
print("--- Final Column Names (Header Row in CSV) ---")
print("Index (1-100) and then:")
print(list(df_sampled.columns))

# Save to CSV
# 'index=True' writes the row index as the first column of the file.
# 'index_label' gives that column the header 'Index'. Without it the index
# has no name, the header cell is written empty, and reading the file back
# yields an 'Unnamed: 0' column instead of 'Index'.
# 'header=True' (the default) writes the column names as the first row.
df_sampled.to_csv("Sample-data.csv", index=True, index_label="Index")

print("\nSaved 100-row dataset with 50/50 target split, relabeled target, and reindexed 1–100 to Sample-data.csv")

--- Final Column Names (Header Row in CSV) ---
Index (1-100) and then:
['Risk', 'Revolving Utilization', 'Age', '30-59 Days Past Due', 'Debt Ratio', 'Monthly Income', 'Open Credit Lines', '90 Days Late', 'Real Estate Loans', '60-89 Days Past Due', 'Dependents']

Saved 100-row dataset with 50/50 target split, relabeled target, and reindexed 1–100 to Sample-data.csv


In [17]:


# Lift pandas' display limits so neither rows nor columns are truncated.
# These are global options and stay in effect for later cells.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Bare last expression: the notebook renders the whole sampled DataFrame.
df_sampled

Unnamed: 0,Risk,Revolving Utilization,Age,30-59 Days Past Due,Debt Ratio,Monthly Income,Open Credit Lines,90 Days Late,Real Estate Loans,60-89 Days Past Due,Dependents
1,low,0.530823,60,0,0.523043,11000.0,15,0,3,0,1.0
2,low,0.202477,48,0,0.235627,9200.0,10,0,1,0,0.0
3,low,0.291932,69,0,1262.0,,9,0,1,0,0.0
4,high,1.0,46,0,117.0,,2,0,0,0,0.0
5,high,0.783038,52,1,0.383255,9100.0,10,0,2,0,1.0
6,high,0.139012,34,0,0.161656,7150.0,7,0,1,2,0.0
7,high,1.0,35,0,2.044637,1500.0,3,0,1,0,1.0
8,low,1.0,29,1,233.0,,1,0,0,0,0.0
9,high,1.0,29,0,707.0,,5,0,0,0,0.0
10,high,0.353459,49,0,0.149391,10100.0,5,2,1,0,0.0
