In [1]:
from sktime.classification.kernel_based import TimeSeriesSVC, RocketClassifier, TimeSeriesSVCTslearn
from sktime.classification.interval_based import TimeSeriesForestClassifier

In [2]:
import pandas as pd

combined = pd.read_csv("/ext01/medgp1167/ushbahh/NEW_DISEASES/Chronic Disease Modelling/Modelling/combined.csv",index_col=['Primary_Index','ref_date'])
# combined = pd.read_csv("/ext01/medgp1167/ushbahh/NEW_DISEASES/Chronic Disease Modelling/Modelling/combined.csv")

In [3]:
combined.shape

(127694, 172)

In [4]:
exclude_columns = ['threshold_date', 'diagnosis_date', 'date_of_birth', 'date_of_death',
	'Practice', 'PatientID',
	'systolic', 'diastolic', 'weight', 'height', 'age_group',
	'race','vitals_id', 'mean_arterial_pressure', 'age', 'bsa', 'pulse', 'time_in_days']
df_filtered = combined.drop(columns=exclude_columns)

In [5]:
#Identify groups with only one entry
group_sizes = df_filtered.groupby(level='Primary_Index').size()

#Filter out groups with greater than 10 entries
indices_to_keep = group_sizes[group_sizes > 10].index
df_filtered = df_filtered.loc[indices_to_keep]

In [6]:
print(df_filtered.index.dtype)

object


In [7]:
df_filtered.sort_index(level=1, ascending=False, inplace=True)

In [8]:
df_filtered = df_filtered.groupby(level='Primary_Index').head(2)

In [9]:
# Convert 'ref_date' index to datetime if it's not already
if not pd.api.types.is_datetime64_any_dtype(df_filtered.index.get_level_values('ref_date')):
    # Create a new MultiIndex with the 'ref_date' converted to datetime
    df_filtered.index = pd.MultiIndex.from_arrays([
        df_filtered.index.get_level_values('Primary_Index'),
        pd.to_datetime(df_filtered.index.get_level_values('ref_date'))
    ], names=['Primary_Index', 'ref_date'])

In [10]:
# Create a function to assign integer values based on order
def convert_secondary_index_to_int(df):
    df = df.copy()
    df['Secondary_Index_Int'] = df.groupby(level=0).cumcount()
    return df.set_index('Secondary_Index_Int', append=True).reset_index(level=1, drop=True)

# Apply the function to the dataframe
df_filtered = convert_secondary_index_to_int(df_filtered)

In [11]:
df_filtered

Unnamed: 0_level_0,Unnamed: 1_level_0,ethnicity_mapping,gender,marital_status,race_mapping,bmi,respiration,temperature,bmi_group,bsa_group,map_group,...,hepatitis_c,thyroid_gland_disorders,rheumatoid_arthritis,osteoarthritis,gerd,neurological_disorders,chronic_lung_disease,depression,hypercholesterolemia,is_diagnosed
Primary_Index,Secondary_Index_Int,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
8595,0,0.0,1.0,1.0,0.0,31.32,0.0,0.000000,4.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1
10073,0,0.0,0.0,1.0,0.0,26.81,16.0,97.500000,3.0,3.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
9507,0,0.0,1.0,0.0,0.0,0.00,18.0,96.900002,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
8595,1,0.0,1.0,1.0,0.0,31.32,0.0,0.000000,4.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1
17174,0,0.0,1.0,0.0,0.0,0.00,12.0,0.000000,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2093,1,0.0,1.0,1.0,1.0,0.00,0.0,0.000000,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4615,1,0.0,1.0,1.0,1.0,0.00,16.0,98.000000,0.0,0.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
1135,0,1.0,1.0,1.0,1.0,0.00,18.0,99.000000,0.0,0.0,4.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1
1135,1,1.0,1.0,1.0,1.0,0.00,18.0,99.000000,0.0,0.0,4.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1


In [12]:
# set second index to datetime
df_filtered.index = df_filtered.index.set_levels([df_filtered.index.levels[0], pd.to_datetime(df_filtered.index.levels[1])])

In [13]:
from sktime.datatypes import check_raise

# Assuming df_filtered is your DataFrame with a possible MultiIndex structure
try:
    # Check if the DataFrame is in the correct 'pd-multiindex' format expected by sktime
    check_raise(df_filtered, mtype="pd-multiindex")
    print("Data is in the correct format for sktime.")
except Exception as e:
    # Handle the exception if the format is not correct
    print(f"Data format error: {e}")


Data is in the correct format for sktime.


In [21]:
X = df_filtered[[x for x in df_filtered.columns if x!='is_diagnosed']]
y = df_filtered['is_diagnosed'][::2]

In [22]:
y.index = X.index


ValueError: Length mismatch: Expected axis has 3123 elements, new values have 6246 elements

In [17]:
import numpy as np
from sktime.split import TemporalTrainTestSplitter
from sktime.forecasting.model_selection import temporal_train_test_split
def group_train_test_split(X, y, test_size=0.2):
    train_indices = []
    test_indices = []
    
    for group in X.index.get_level_values('Primary_Index').unique():
        X_group = X.loc[group]
        y_group = y.loc[group]
        X_train_group, X_test_group, y_train_group, y_test_group = temporal_train_test_split(X_group, y_group, test_size=test_size)
        
        train_indices.extend(X_train_group.index.tolist())
        test_indices.extend(X_test_group.index.tolist())
    
    return X.loc[train_indices], X.loc[test_indices], y.loc[train_indices], y.loc[test_indices]

# Apply the group_train_test_split function
X_train, X_test, y_train, y_test = group_train_test_split(X, y, test_size=0.2)

KeyError: "None of [DatetimeIndex(['1970-01-01 00:00:00.000000001'], dtype='datetime64[ns]', name='Secondary_Index_Int', freq=None)] are in the [index]"

In [19]:
y.index == X.index

ValueError: Lengths must match to compare

In [None]:
X.shape

In [None]:
X

In [None]:
import numpy as np
from sktime.split import TemporalTrainTestSplitter

splitter = TemporalTrainTestSplitter(test_size=0.2)

# Perform the split and save the indices into train and test variables
train_indices, test_indices = next(splitter.split(X.index.get_level_values(1)))
X_train = X.iloc[train_indices]
X_test = X.iloc[test_indices]
train_indices_y, test_indices_y = next(splitter.split(y.index.get_level_values(1)))
y_train = y.iloc[train_indices]
y_test = y.iloc[test_indices]

In [None]:
# Split the data
splitter = TemporalTrainTestSplitter(test_size=0.3)
train_indices, test_indices = next(splitter.split(X.index.get_level_values(1)))

# Extract train and test data using iloc for positional indexing
X_train = X.iloc[train_indices]
X_test = X.iloc[test_indices]
y_train = y.iloc[train_indices]
y_test = y.iloc[test_indices]

In [None]:
y_train

In [None]:
X_train.shape

rocket classifier

In [None]:
rocket = RocketClassifier()
rocket.fit(X, y)

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
model1 = RocketClassifier()
# model1.set_config(n_jobs=-1)
model1.fit(X, y)

In [None]:
from sklearn.metrics import f1_score

f1_score(y, rocket.predict(X))

timeseries svc

In [None]:
model = TimeSeriesSVC()
# model.set_config('n_jobs'=-1)
model.fit(X, y)

In [None]:
from sklearn.metrics import f1_score

f1_score(y, model.predict(X))

In [None]:
# model.set_config({ 
#                   'backend:parallel': True,
#  'backend:parallel:params': True
#  })

In [None]:
# import numpy as np
# data = {
#     'Primary_Index': [0, 0, 1, 1],
#     'ref_date': ['2016-08-17', '2016-09-02', '2017-07-16', '2017-10-25'],
#     'ethnicity_mapping': [0.0, 0.0, 1.0, 1.0],
#     'gender': [0.0, 0.0, 1.0, 1.0],
#     'marital_status': [1.0, 1.0, 1.0, 1.0],
#     # Add more features as needed
# }

# df = pd.DataFrame(data)
# df['ref_date'] = pd.to_datetime(df['ref_date'])
# df.set_index(['Primary_Index', 'ref_date'], inplace=True)

# # Group by Primary_Index
# grouped = df.groupby(level=0)

# # Create the nested DataFrame dynamically
# nested_data = pd.DataFrame()

# for col in df.columns:
#     nested_data[col] = grouped[col].apply(list)

# # Convert lists to numpy arrays for sktime compatibility
# nested_data = nested_data.applymap(np.array)

In [None]:
grouped = df_filtered.groupby(level=0)

nested_data = pd.DataFrame()

for col in df_filtered.columns:
    if col=='is_diagnosed':
        continue
    try:
        nested_data[col] = grouped[col].apply(list)
    except:
        print(col)

In [None]:
import copy

nested_data = copy.deepcopy(nested_data)

In [None]:
nested_data = nested_data.applymap(np.array)

In [None]:
df_filtered.columns

In [None]:


# Convert 'ref_date' index to datetime if it's not already
if not pd.api.types.is_datetime64_any_dtype(df_filtered.index.get_level_values('ref_date')):
    # Create a new MultiIndex with the 'ref_date' converted to datetime
    df_filtered.index = pd.MultiIndex.from_arrays([
        df_filtered.index.get_level_values('Primary_Index'),
        pd.to_datetime(df_filtered.index.get_level_values('ref_date'))
    ], names=['Primary_Index', 'ref_date'])


In [None]:
print(df_filtered.index.get_level_values('ref_date'))

In [None]:
# Reprint the index to check
print(df_filtered.index)

In [None]:
from sktime.datatypes import check_raise

try:
    check_raise(df_filtered, mtype="pd-multiindex")
    print("Data is in the correct format.")
except Exception as e:
    print(f"Data format error: {e}")


In [None]:
df_filtered = df_filtered.sort_index()

In [None]:
from sktime.datatypes import check_raise

try:
    check_raise(df_filtered, mtype="pd-multiindex")
    print("Data is in the correct format.")
except Exception as e:
    print(f"Data format error: {e}")

In [None]:
df_filtered

In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

from sktime.classification.deep_learning import InceptionTimeClassifier
from sktime.datasets import load_unit_test  
X_train, y_train = load_unit_test(split="train")  
X_test, y_test = load_unit_test(split="test")  
clf = InceptionTimeClassifier()  
clf.fit(X_train, y_train)  

In [None]:
load_unit_test(split="train")  

In [None]:
features = df_filtered.drop(columns=['is_diagnosed', 'time_in_days'])
target = df_filtered['is_diagnosed']

In [None]:
features

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sktime.classification.deep_learning import InceptionTimeClassifier
from sklearn.metrics import accuracy_score, f1_score

In [None]:
import sktime.classification.kernel_based as something

In [None]:
# Convert MultiIndex DataFrame to nested format expected by sktime
X_nested = features.groupby(level=0).apply(lambda g: g.droplevel(0).reset_index(drop=True))
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_nested, target, test_size=0.2, random_state=42)


In [None]:
# Convert MultiIndex DataFrame to the required format
def convert_to_sktime_format(df):
    grouped = df.groupby(level=0)
    nested_list = [group.droplevel(0).reset_index(drop=True).T for _, group in grouped]
    nested_df = pd.DataFrame(nested_list)
    return nested_df

# Convert the features DataFrame to the required format
X_sktime = convert_to_sktime_format(features)

# Ensure target is properly aligned
y_sktime = target.groupby(level=0).first()

In [None]:
X_train = tuple(map(tuple, X_train.values))
X_test = tuple(map(tuple, X_test.values))
y_train = tuple(y_train.values)
y_test = tuple(y_test.values)

In [None]:
from sklearn.metrics import accuracy_score, f1_score
y_pred = clf.predict(X_test)

# Calculate accuracy and F1 score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')  # Use 'weighted' to handle multiclass classification

# Print the results
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

In [None]:
X_train

In [None]:
# from sktime.utils.data_processing import from_2d_array_to_nested
from sktime.classification.kernel_based import RocketClassifier
from sklearn.model_selection import train_test_split

# Assuming df_filtered is your DataFrame and you have a target column
X = df_filtered.drop(columns=['is_diagnosed','time_in_days']) # Feature set
y = df_filtered['is_diagnosed']  # Target variable

In [None]:
from sklearn.model_selection import train_test_split

# Perform the train-test split
X_train, X_test, y_train, y_test = train_test_split(df_filtered, y, test_size=0.2, random_state=42)

In [None]:
X_train

In [None]:
test = load_unit_test(split='train')

In [None]:
test[1]

In [None]:
test[0]

In [None]:
test[0].loc[9, 'dim_0']

In [None]:
# Ensure the DataFrame is sorted by the index
df_filtered = df_filtered.sort_index()

# Convert MultiIndex DataFrame to the required format
def convert_to_sktime_format(df):
    grouped = df.groupby(level=0)
    nested_list = [group.droplevel(0).reset_index(drop=True).T for _, group in grouped]
    nested_df = pd.DataFrame(nested_list)
    return nested_df

# Convert the features DataFrame to the required format
X_sktime = convert_to_sktime_format(df_filtered)

# Ensure target is properly aligned
y_sktime = y.groupby(level=0).first()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sktime.classification.deep_learning import InceptionTimeClassifier
from sklearn.metrics import accuracy_score, f1_score

df = df_filtered
# Assuming 'target' is the column name for your target variable
target_column = 'is_diagnosed'  # Replace with your actual target column name

# Check if 'time_in_days' needs to be dropped
if 'time_in_days' in df.columns:
    df = df.drop(columns=['time_in_days'])

# Ensure the primary index is set correctly for both features and target
df = df.set_index(['Primary_Index', 'ref_date'])
target = df[target_column].groupby('Primary_Index').first()
features = df.drop(columns=[target_column])

# Convert MultiIndex DataFrame to nested format expected by sktime
def convert_to_nested(df):
    # Group by primary index and convert each group to a Series
    nested_df = df.groupby(level=0).apply(lambda g: g.droplevel(0).reset_index(drop=True))
    return nested_df

# Assuming your DataFrame is already set with a MultiIndex
X_nested = convert_to_nested(features)

# Ensure target is properly aligned
y_nested = target.groupby(level=0).first()

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_nested, y_nested, test_size=0.2, random_state=42)

# Initialize the classifier
clf = InceptionTimeClassifier()

# Fit the model
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Calculate accuracy and F1 score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the results
print(f"\nAccuracy: {accuracy}")
print(f"F1 Score: {f1}")


In [None]:
clf = InceptionTimeClassifier()  
clf.fit(X_train, y_train)  

In [None]:
import pandas as pd

# Load data into a pandas DataFrame
file_path = '/ext01/medgp1167/ushbahh/NEW_DISEASES/Chronic Disease Modelling/FeatureEngineering/Heart_I25/test1/diagnosed.csv'  # Replace with your file path
filepath1 = '/ext01/medgp1167/ushbahh/NEW_DISEASES/Chronic Disease Modelling/FeatureEngineering/Heart_I25/test1/normal.csv'
diagnosed_df = pd.read_csv(file_path)
normal_df = pd.read_csv(filepath1)

In [None]:


# Create a unique identifier for each (PatientID, Practice) combination
diagnosed_df['Primary_Index'] = diagnosed_df.groupby(['PatientID', 'Practice']).ngroup()
diagnosed_df.set_index(['Primary_Index', 'ref_date'], inplace=True)
# Sort the index for better readability
diagnosed_df.sort_index(inplace=True)


# Create a unique identifier for each (PatientID, Practice) combination
normal_df['Primary_Index'] = normal_df.groupby(['PatientID', 'Practice']).ngroup()
normal_df.set_index(['Primary_Index', 'ref_date'], inplace=True)
# Sort the index for better readability
normal_df.sort_index(inplace=True)

In [None]:
normal_df.to_csv('normal_dataset.csv', index=True)
diagnosed_df.to_csv('diagnosed_dataset.csv', index=True)

In [None]:
pd.read_csv('/ext01/medgp1167/ushbahh/NEW_DISEASES/Chronic Disease Modelling/FeatureEngineering/diagnosed_dataset.csv', index_col=[0,1])

In [None]:
normal_df = normal_df.drop(columns=['race', 'age', 'Practice'])
diagnosed_df = diagnosed_df.drop(columns=['race', 'age', 'Practice'])

In [None]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# # Split the DataFrame into two parts
# df_diagnosed, df_normal = train_test_split(diagnosed_df, test_size=0.5, random_state=42)

# # Save each split DataFrame into separate CSV files
# df_diagnosed.to_csv('fake_diagnosed.csv', index=True)
# df_normal.to_csv('fake_normal.csv', index=True)

In [None]:

normal_df['threshold_date'] = pd.to_datetime(normal_df['threshold_date'])
normal_df['latest_encounter'] = pd.to_datetime(normal_df['latest_encounter'])
normal_df['date_of_birth'] = pd.to_datetime(normal_df['date_of_birth'])
normal_df['date_of_death'] = pd.to_datetime(normal_df['date_of_death'])

diagnosed_df['threshold_date'] = pd.to_datetime(diagnosed_df['threshold_date'])
diagnosed_df['diagnosis_date'] = pd.to_datetime(diagnosed_df['diagnosis_date'])
diagnosed_df['date_of_birth'] = pd.to_datetime(diagnosed_df['date_of_birth'])
diagnosed_df['date_of_death'] = pd.to_datetime(diagnosed_df['date_of_death'], errors='coerce')



In [None]:
from sktime.datatypes import check_raise

# Check if the DataFrame is compatible with sktime
try:
    check_raise(normal_df, mtype="pd-multiindex")
    print("Data is in the correct format.")
except Exception as e:
    print(f"Data format error: {e}")

In [None]:
from sktime.datatypes import check_raise

# Check if the DataFrame is compatible with sktime
try:
    check_raise(diagnosed_df, mtype="pd-multiindex")
    print("Data is in the correct format.")
except Exception as e:
    print(f"Data format error: {e}")

In [None]:
object_columns = diagnosed_df.select_dtypes(include=['object']).columns
object_columns

In [None]:
print(normal_df.dtypes)

### RocketClassifier

In [None]:
import sktime

sktime.__version__

In [None]:
df_filtered.index.names

In [None]:
import pandas as pd

# Check the current type of the 'ref_date' index
print(df_filtered.index.get_level_values('ref_date'))

# Convert 'ref_date' index to datetime if it's not already
if not pd.api.types.is_datetime64_any_dtype(df_filtered.index.get_level_values('ref_date')):
    # Create a new MultiIndex with the 'ref_date' converted to datetime
    df_filtered.index = pd.MultiIndex.from_arrays([
        df_filtered.index.get_level_values('Primary_Index'),
        pd.to_datetime(df_filtered.index.get_level_values('ref_date'))
    ], names=['Primary_Index', 'ref_date'])


In [None]:
# Reprint the index to check
print(df_filtered.index)


In [None]:
from sktime.datatypes import check_raise

try:
    check_raise(df_filtered, mtype="pd-multiindex")
    print("Data is in the correct format.")
except Exception as e:
    print(f"Data format error: {e}")


In [None]:
df_filtered = df_filtered.sort_index()

In [None]:
# Check for duplicates in the index
if df_filtered.index.duplicated().any():
    print("There are duplicate indices.")
    # Optional: Drop duplicates if necessary
    df_filtered = df_filtered[~df_filtered.index.duplicated(keep='first')]
else:
    print("No duplicate indices found.")


In [None]:
# from sktime.utils.data_processing import from_2d_array_to_nested
from sktime.classification.kernel_based import RocketClassifier
from sklearn.model_selection import train_test_split

# Assuming df_filtered is your DataFrame and you have a target column
X = df_filtered.drop(columns=['is_diagnosed','time_in_days']) # Feature set
y = df_filtered['is_diagnosed']  # Target variable




In [None]:
RocketClassifier()

In [None]:
import pandas as pd

# Assuming 'X' is a DataFrame where each row is a time series
# Create a nested DataFrame
def convert_to_nested(df):
    nested_data = {col: [pd.Series(df[col].iloc[i]) for i in range(df.shape[0])] for col in df.columns}
    return pd.DataFrame(nested_data)

X_nested = convert_to_nested(X)

# Verify the structure
print(X_nested.head())
print(f"Shape of X_nested: {X_nested.shape}")
print(f"Length of y: {len(y)}")


In [None]:
len(set([x[0] for x in (X.index.to_list())]))

In [None]:
# X_nested.to_csv("x_nested.csv")

In [None]:
from sklearn.model_selection import train_test_split

# Perform the train-test split
X_train, X_test, y_train, y_test = train_test_split(X_nested, y, test_size=0.2, random_state=42)


In [None]:
from sktime.classification.kernel_based import RocketClassifier
from sklearn.metrics import accuracy_score, f1_score
# Initialize and train the RocketClassifier
rocket_clf = RocketClassifier()
rocket_clf.fit(X_train, y_train)

# Predict on the test set
y_pred = rocket_clf.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_train, y_pred)
print(f"Accuracy: {accuracy:.2f}")


#### for nested data

In [None]:
from sklearn.metrics import accuracy_score, f1_score

f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")

#### for pd.multiindex

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
from sktime.classification.kernel_based import RocketClassifier

# Initialize and train the RocketClassifier
rocket_clf = RocketClassifier()
rocket_clf.fit(X_train, y_train)


In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Predict on the test set
y_pred = rocket_clf.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")


#### for pd.wide

In [None]:
sktime.__version__

In [None]:
# Evaluate the classifier
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

In [None]:
!pip install sktime

In [None]:
from sktime.utils.data_processing import from_2d_array_to_nested
from sklearn.model_selection import train_test_split

X_nested = from_2d_array_to_nested(X)

In [None]:
from sktime.datatypes import check_raise

# Check if the DataFrame is compatible with sktime
try:
    check_raise(df_filtered, mtype="pd-multiindex")
    print("Data is in the correct format.")
except Exception as e:
    print(f"Data format error: {e}")

In [None]:
from sktime.forecasting.model_selection import temporal_train_test_split

TemporalTrainTestSplitter(x, y, test_size=0.2, random_state=42, shuffle=True)

In [None]:
# x_train = pd.read_csv("/ext01/medgp1167/ushbahh/NEW_DISEASES/Chronic Disease Modelling/Modelling/X_train.csv", index_col=[0,1])
# y_train = pd.read_csv("/ext01/medgp1167/ushbahh/NEW_DISEASES/Chronic Disease Modelling/Modelling/y_train.csv", index_col=[0,1])

# x_test = pd.read_csv("/ext01/medgp1167/ushbahh/NEW_DISEASES/Chronic Disease Modelling/Modelling/X_test.csv", index_col=[0,1])
# y_test = pd.read_csv("/ext01/medgp1167/ushbahh/NEW_DISEASES/Chronic Disease Modelling/Modelling/y_test.csv", index_col=[0,1])

# x_val = pd.read_csv("/ext01/medgp1167/ushbahh/NEW_DISEASES/Chronic Disease Modelling/Modelling/x_val.csv", index_col=[0,1])
# y_val = pd.read_csv("/ext01/medgp1167/ushbahh/NEW_DISEASES/Chronic Disease Modelling/Modelling/y_val.csv", index_col=[0,1])


In [None]:
x_train.head(5)

In [None]:
x_train.index

In [None]:
list(x_train.columns)

In [None]:
from sktime.classification.kernel_based import RocketClassifier
from sktime.datasets import load_unit_test
# X_train, y_train = load_unit_test(split="train", return_X_y=True)
# X_test, y_test = load_unit_test(split="test", return_X_y=True) 
clf = RocketClassifier(num_kernels=500) 
clf.fit(x_train, y_train) 
y_pred = clf.predict(x_test) 


In [None]:
load_unit_test(return_X_y=False)

In [None]:
type(load_unit_test)

In [None]:
print(load_unit_test)

In [None]:
# for simplest evaluation, compare ground truth to predictions
from sklearn.metrics import accuracy_score, f1_score

print(accuracy_score(y_test, y_pred))
print(f1_score(y_test, y_pred, average='weighted'))

In [None]:
# import to retrieve examples
from sktime.datatypes import get_examples
get_examples(mtype="pd-multiindex", as_scitype="Panel")[0]

In [None]:
from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier
from sktime.datasets import load_osuleaf

# data should be split into train/test
X_train, y_train = load_osuleaf(split="train", return_type="numpy3D")
X_test, y_test = load_osuleaf(split="test", return_type="numpy3D")
X_test = X_test[:2]
y_test = y_test[:2]

# step 3-5 are the same
from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier
from sktime.dists_kernels.compose_tab_to_panel import AggrDist
from sktime.dists_kernels import ScipyDist

mean_eucl_dist = AggrDist(ScipyDist())
clf = KNeighborsTimeSeriesClassifier(n_neighbors=3, distance=mean_eucl_dist)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# for simplest evaluation, compare ground truth to predictions
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

In [None]:
X_train

In [None]:
from sktime.classification.kernel_based import RocketClassifier
from sktime.datasets import load_unit_test
X_train, y_train = load_unit_test(split="train", return_X_y=True)
X_test, y_test = load_unit_test(split="test", return_X_y=True) 
clf = RocketClassifier(num_kernels=500) 
clf.fit(X_train, y_train) 
y_pred = clf.predict(X_test) 

In [None]:
X_train

In [None]:
load_unit_test(split='test', return_X_y=True, return_type=None)

In [None]:
# for simplest evaluation, compare ground truth to predictions
from sklearn.metrics import accuracy_score, f1_score

print(accuracy_score(y_test, y_pred))
print(f1_score(y_test, y_pred, average='weighted'))

In [None]:
import pandas as pd
import numpy as np

# Load your dataset
df = pd.read_csv('/ext01/medgp1167/ushbahh/NEW_DISEASES/Chronic Disease Modelling/FeatureEngineering/Hyperlipidemia_E78/test_temporal1/diagnosed.csv')

# Convert 'date' and 'ref_date' to datetime
df['date'] = pd.to_datetime(df['date'])
df['ref_date'] = pd.to_datetime(df['ref_date'])

# Sort the data
df = df.sort_values(['PatientID', 'ref_date', 'date'])

# Feature Scaling (excluding non-numeric columns)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[df.columns.difference(['PatientID', 'date', 'ref_date', 'target_column'])] = scaler.fit_transform(df[df.columns.difference(['PatientID', 'date', 'ref_date', 'target_column'])])

# Create sequences for each ref_date
def create_sequences(df, seq_length=5):
    sequences = []
    targets = []
    for patient in df['PatientID'].unique():
        patient_data = df[df['PatientID'] == patient]
        for ref_date in patient_data['ref_date'].unique():
            ref_data = patient_data[patient_data['ref_date'] == ref_date]
            for i in range(len(ref_data) - seq_length + 1):
                sequence = ref_data.iloc[i:i + seq_length].drop(['PatientID', 'ref_date', 'target_column'], axis=1).values
                target = ref_data['target_column'].values[i + seq_length - 1]
                sequences.append(sequence)
                targets.append(target)
    return np.array(sequences), np.array(targets)

seq_length = 5
X, y = create_sequences(df, seq_length)

# Train-Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LSTM Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

model = Sequential()
model.add(LSTM(128, input_shape=(seq_length, X_train.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))  # Assuming binary classification

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy}')
