In [32]:
# Import necessary libraries
import pandas as pd        # For data manipulation and analysis
import numpy as np         # For numerical operations
import warnings            # To suppress warnings

# Visualization libraries
import matplotlib.pyplot as plt  # For plotting
plt.style.use('ggplot')          # Use ggplot style for consistent plots
import seaborn as sns            # Advanced statistical visualization

# Scientific & Statistical libraries
import scipy as sp               # Scientific computations
import statsmodels.api as sm     # Statistical modeling, regression, hypothesis testing

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')
def feature_summary(data, feature):
    """
    Bir DataFrame içindeki belirli bir sütun hakkında özet bilgi döner.
    - Eksik değer sayısı
    - String ve sayısal değerlerin sayısı
    - Sayısal değerler için Q1, Q2 (Medyan), Q3
    
    Parameters:
        data (pd.DataFrame): Giriş DataFrame'i
        feature (str): İncelenecek sütun adı
    
    Returns:
        dict: Hesaplanan özet bilgileri içeren sözlük
    """
    # Eksik değer sayısı
    missing_values = data[feature].isna().sum()
    
    # String ve sayısal değerlerin sayısı
    string_count = data[feature].apply(lambda x: isinstance(x, str)).sum()
    numeric_count = data[feature].apply(lambda x: isinstance(x, (int, float, np.number))).sum()
    
    # Sayısal değerler için Q1, Q2 (Medyan) ve Q3
    numeric_values = pd.to_numeric(data[feature], errors='coerce')  # Sayısal değerlere dönüştür
    q1 = numeric_values.dropna().quantile(0.25)  # 1. Çeyrek
    q2 = numeric_values.dropna().median()       # Medyan
    q3 = numeric_values.dropna().quantile(0.75) # 3. Çeyrek
    
    # Sonuçları sözlük olarak döndür
    summary = {
        'Missing Values': missing_values,
        'String Count': string_count,
        'Numeric Count': numeric_count,
        'Q1 (25%)': q1,
        'Q2 (Median)': q2,
        'Q3 (75%)': q3
    }
    
    return summary


In [39]:
df = pd.read_csv('rawdata.csv')


In [42]:
# Sayısal dönüşüm denemesi
numeric_converted = pd.to_numeric(df['YearsOfEducation'], errors='coerce')

# Sayısala dönüşemeyen değerler NaN olarak gelir, bunları orijinal değerlerle dolduruyoruz
df['YearsOfEducation'] = numeric_converted.fillna(df['YearsOfEducation'])
import re 

education_map = {
    'highschool': 12,
    'hs': 12,
    'ged': 12,
    'college': 14,
    'some college': 15,
    'associates': 14,
    'bachelors': 16,
    "bachelor's": 16,
    'bsn': 16,
    'masters': 18,
    'mba': 18,
    'pharmd': 18,
    'postgraduate': 18,
    'phd': 20,
    'doctorate': 20,
    'md': 20,
    'jd': 20,
    'np': 20,
    'engineer': 18,
}

# Fonksiyon: Yıl tahmini
def estimate_years(value):
    # 16+ gibi değerler için
    if re.match(r'^\d+\+$', str(value)):
        return int(value.strip('+')) + 2
    # 12 yrs, 14.5+ gibi detaylı formatlar
    elif re.match(r'^\d+\.?\d*\s*yrs?', str(value)):
        return float(re.search(r'\d+\.?\d*', value).group())
    # Sözlükten tahminler
    elif isinstance(value, str):
        value_lower = value.lower()
        for key, year in education_map.items():
            if key in value_lower:
                return year
        # Varsayılan tahminler
        if 'school' in value_lower:
            return 12
        if 'college' in value_lower or 'university' in value_lower:
            return 14
    return None  # Bilinmeyen değerler için

# Tahmin edilen yılları orijinal sütuna yaz
df.loc[df['YearsOfEducation'].apply(lambda x: isinstance(x, str)), 'YearsOfEducation'] = (
    df['YearsOfEducation'].apply(estimate_years)
)

{'Missing Values': 71,
 'String Count': 0,
 'Numeric Count': 32886,
 'Q1 (25%)': 1.0,
 'Q2 (Median)': 2.0,
 'Q3 (75%)': 3.0}

In [31]:
for i in df[df['YearsOfEducation'].apply(lambda x: isinstance(x, str))].YearsOfEducation.values:
    print(i)

Unnamed: 0,MaternalPatientID,MaternalMedicalRecordNumber,ChildPatientID,ChildMedicalRecordNumber,ChildEncounterID,MaternalAgeAtDelivery,MaternalRace,MaternalLanguage,MyChartAccess,LatestMaritalStatus,...,MaxBloodGlucose,MinSerumBilirubin,MaxSerumBilirubin,MaxTranscutaneousBilirubin,MinTranscutaneousBilirubin,Phototherapy,FailedHearing,CarSeatTestFailures,FeedingType,PercentWeightLoss
1686,4408736313,84007115,44010635056,88201706,440562000000.0,25,Black race,English Language,Y,Unmarried person,...,119.0,,,4.4,3.9,N,N,,Formula,-4.2
8561,44010516276,88083472,44010879319,88446504,440593000000.0,26,Caucasian,English Language,Y,Currently Married,...,,6.0,6.6,9.1,6.9,N,N,,Human Milk+Formula,-2.8
11443,140Z2251351,34469318,4408642191,83903025,440343000000.0,38,Asians,English Language,Y,Currently Married,...,66.0,,,6.9,5.7,N,N,0.0,Human Milk,-5.4
19742,1405207434,73616280,4409736942,87386099,440465000000.0,29,Black race,English Language,Y,Unmarried person,...,80.0,,,9.9,5.4,N,N,,Human Milk,-6.1
21374,140Z3122166,34137781,44010863436,88430596,440590000000.0,36,Caucasian,English Language,Y,domestic partner,...,87.0,7.7,7.7,8.2,6.9,N,N,,Human Milk,-4.6


In [141]:

df['ChildReadmissionFirstMonth'].value_counts()


ChildReadmissionFirstMonth
N    32664
Y      222
Name: count, dtype: int64

In [139]:
feature_summary(data=df, feature='MultipleGestation')

{'Missing Values': 0,
 'String Count': 32886,
 'Numeric Count': 0,
 'Q1 (25%)': nan,
 'Q2 (Median)': nan,
 'Q3 (75%)': nan}

In [136]:
na_sums = df[df['ChildReadmissionFirstMonth'] == 'Y'].isna().sum()

# Keep only those with counts greater than 0
na_sums_filtered = na_sums[na_sums > 0]
(na_sums_filtered/2.22).round(2)

YearsOfEducation                    94.14
Parity                              18.02
ChildDeathDate                      99.10
MaternalDeathDate                  100.00
DeliveryType                         1.35
Mother_BMI_AtDelivery               18.02
PrePregnancyWeight_KG               28.38
HepB_PositiveDuringPregnancy         1.35
HCV_PositiveDuringPregnancy         73.42
HIV_PositiveDuringPregnancy          1.35
SyphilisPositiveDuringPregnancy      7.66
GBS_PositiveDuringPregnancy         23.87
Apgar1Minute                         0.45
Apgar5Minute                         0.45
Cord_pH_Venous_Date                 77.03
Cord_pH_Venous                      77.03
Cord_pH_Arterial_Date               41.89
Cord_pH_Arterial                    41.89
MinBloodGlucose                     54.05
MaxBloodGlucose                     54.05
MinSerumBilirubin                   54.95
MaxSerumBilirubin                   54.95
CarSeatTestFailures                 81.53
FeedingType                       

In [None]:
# High risk data
'MaternalPatientID', 'MaternalMedicalRecordNumber', 'ChildPatientID','ChildMedicalRecordNumber', 'ChildEncounterID', 
# Clinic Data
# Maternal Demographic Characteristics
'MaternalAgeAtDelivery','MaternalRace', 'MaternalLanguage', 
'MyChartAccess',
'LatestMaritalStatus', 'YearsOfEducation', 'Gravida', 'Parity',
'DeliveryDate', 'GestationalAge_Weeks', 'GestAge_Days',
'ChildDeathDate', 'MaternalDeathDate', 
# Children demographic characteristics
'ChildGender', 'ChildRace','ChildEthnicity',

# Prenatal Characteristics
'Mother_BMI_AtDelivery', 'PrePregnancyWeight_KG',
'SSRI_DuringPregnancy', 'Suboxone_DuringPregnancy',
'Methadone_DuringPregnancy', 'PositiveDrugScreenDuringPregnancy',
'HepB_PositiveDuringPregnancy', 'HCV_PositiveDuringPregnancy',
'HIV_PositiveDuringPregnancy', 'SyphilisPositiveDuringPregnancy',
'GBS_PositiveDuringPregnancy', 'Alcohol abuse',
'Antepartum Haemorrhage', 'Eclampsia', 'IUGR',
'Maternal chronic hypertension', 'Maternal Diabetes',
'Maternal pregnancy-induced hypertension', 'Oligohydramnios',
'Placenta previa', 'Polyhydramnios', 'Pre-eclampsia',
'Pregnancy Complications', 'Preterm premature rupture of membranes',
'Smoking during pregnancy', 'Spontaneous abortion',
'Umbilical cord prolapse', 'Vaginal delivery',
 
# Natal and Delivery Characteristics
'DeliveryType', 'ROM_Time', 'MultipleGestation',
# ER visit Characteristics
'ER_VisitDuringPregnancy', 'ChildER_VisitFirstMonth',
'ChildReadmissionFirstMonth', 'ChildOfficeVisitsFirstMonth',
# Ward characteristics
'ChildLengthOfStayHours', 'DischargeMonth','BirthWeight_Grams',
'SGA', 
       
       'AppointmentMadeDuringAdmission', 'Apgar1Minute', 'Apgar5Minute',
       'Cord_pH_Venous_Date', 'Cord_pH_Venous', 'Cord_pH_Arterial_Date',
       'Cord_pH_Arterial', 'MaxHeartRate', 'MinHeartRate', 'AvgHeartRate',
       'AvgPulseOx', 'Desaturations', 'MaxTemperature', 'MinTemperature',
       'MinBloodGlucose', 'MaxBloodGlucose', 'MinSerumBilirubin',
       'MaxSerumBilirubin', 'MaxTranscutaneousBilirubin',
       'MinTranscutaneousBilirubin', 'Phototherapy', 'FailedHearing',
       'CarSeatTestFailures', 'FeedingType', 'PercentWeightLoss'