In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.feature_selection import VarianceThreshold, SelectFromModel
from sklearn.model_selection import train_test_split
import os
from pathlib import Path

def inspect_single_file(file_path, patient_id=0):
    """Load one CSV file and print a quick diagnostic summary of it.

    The summary printed to stdout contains the file name, the frame's shape,
    the min/max of the ``time`` column, the column names and dtypes, the
    per-column missing-value counts (only columns with at least one missing
    value are listed), ``DataFrame.describe()`` and the first five rows.

    Args:
        file_path: Path (string or path-like) of the CSV file to read.
        patient_id: Identifier shown in the banner; used for display only.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` if reading or
        summarising the file raised an exception (for example the file does
        not exist, cannot be parsed, or has no ``time`` column).
    """
    banner = "=" * 60
    print(banner)
    print(f"STEP 1: INSPECTING PATIENT FILE {patient_id}")
    print(banner)

    try:
        # Read the CSV file
        df = pd.read_csv(file_path)

        # Basic information
        n_rows, n_cols = df.shape
        print(f" File: {os.path.basename(file_path)}")
        print(f" Shape: {df.shape} (rows: {n_rows}, columns: {n_cols})")
        # Raises KeyError when there is no 'time' column; reported below.
        time_col = df["time"]
        print(f" Time range: {time_col.min():.2f} to {time_col.max():.2f} seconds")

        # Column information
        print(f" Columns: {list(df.columns)}")
        print(" Data types:")
        print(df.dtypes)

        # Check for missing values (only columns that actually have some)
        print(" Missing values per column:")
        missing_data = df.isnull().sum()
        print(missing_data[missing_data > 0])

        # Basic statistics
        print(" Basic statistics:")
        print(df.describe())

        # Show first few rows
        print(" First 5 rows:")
        print(df.head())

        return df

    except Exception as e:
        # Best-effort helper: callers still get None on failure, but the
        # reason is reported instead of being silently discarded.
        print(f" ERROR: could not inspect {file_path}: {type(e).__name__}: {e}")
        return None

# Let's test with one file first.
# Sort the matches so that "the first file" is the same on every run;
# the order in which glob yields paths is not guaranteed.
patient_files = sorted(Path("100f").glob("*.csv"))
if patient_files:
    sample_df = inspect_single_file(patient_files[0], patient_id=0)
else:
    # Keep sample_df defined so any later use sees None rather than
    # failing with a NameError.
    sample_df = None
    print("No CSV files found in '100f'")

STEP 1: INSPECTING PATIENT FILE 0
 File: Run1.csv
 Shape: (15354, 5) (rows: 15354, columns: 5)
 Time range: 0.00 to 153.53 seconds
 Columns: ['time', 'paw', 'flow', 'vol', 'pmus']
 Data types:
time    float64
paw     float64
flow    float64
vol     float64
pmus    float64
dtype: object
 Missing values per column:
Series([], dtype: int64)
 Basic statistics:
               time           paw          flow           vol          pmus
count  15354.000000  15354.000000  15354.000000  15354.000000  15354.000000
mean      76.765000      9.284987     -0.000142      0.084936     -0.831763
std       44.324624      2.490251      0.386601      0.132017      1.374469
min        0.000000      5.849335     -0.805350     -0.024151     -5.473509
25%       38.382500      8.300838     -0.104110     -0.001411     -1.173121
50%       76.765000      8.417437     -0.022147      0.022635     -0.087534
75%      115.147500      8.985041      0.006336      0.116948     -0.030532
max      153.530000     18.240479