In [1]:
# Install tsflex and seglearn
!pip install tsflex --no-index --find-links=file:///kaggle/input/time-series-tools
!pip install seglearn --no-index --find-links=file:///kaggle/input/time-series-tools

Looking in links: file:///kaggle/input/time-series-tools
Processing /kaggle/input/time-series-tools/tsflex-0.3.0-py3-none-any.whl
Installing collected packages: tsflex
Successfully installed tsflex-0.3.0
[0mLooking in links: file:///kaggle/input/time-series-tools
Processing /kaggle/input/time-series-tools/seglearn-1.2.5-py3-none-any.whl
Installing collected packages: seglearn
Successfully installed seglearn-1.2.5
[0m

# Data loading and feature extraction 

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from sklearn import *
import glob
import gc
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from seglearn.feature_functions import base_features, emg_features

from tsflex.features import FeatureCollection, MultipleFeatureDescriptors
from tsflex.features.integrations import seglearn_feature_dict_wrapper


import pathlib
import xgboost as xgb
import lightgbm as lgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.base import clone
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.model_selection import  StratifiedGroupKFold
from scipy.stats import uniform, randint
from sklearn.metrics import average_precision_score, make_scorer

from scipy.signal import find_peaks, chirp, find_peaks, peak_widths

import warnings
warnings.filterwarnings('ignore')


In [3]:
# Sentinel written into missing cells by the `.fillna(na)` calls in this notebook.
na=-11

In [4]:
# Reduce Memory Usage
# reference : https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65 @ARJANGROEN

def reduce_memory_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    * Signed and unsigned integer columns are narrowed to the smallest integer
      type of the same signedness whose range contains the observed min/max.
    * Float columns are narrowed to float16, else float32, when their range
      fits.  This is lossy (float16 keeps roughly 3 significant digits).
    * ``object`` columns are converted to ``category``.
    * Boolean, datetime, timedelta, categorical and other pandas
      extension-dtype columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    for col in df.columns:
        dtype = df[col].dtype
        # Extension dtypes (category, nullable ints, strings, ...) are not
        # numpy dtypes; leave them alone instead of guessing a conversion.
        if not isinstance(dtype, np.dtype):
            continue

        kind = dtype.kind
        if kind == 'O':
            df[col] = df[col].astype('category')
        elif kind == 'i' or kind == 'u':
            c_min = df[col].min()
            c_max = df[col].max()
            # Stay within the same signedness: routing unsigned columns
            # through the float branch would silently round large values.
            candidates = signed_types if kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # An empty column yields NaN here, so both comparisons are
                # False and the column keeps its dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in float_types:
                limits = np.finfo(candidate)
                # Strict bounds keep +/-inf (and all-NaN columns) from being
                # squeezed into a narrower type.
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        # Every other kind (bool, datetime, timedelta, ...) is kept as is.

    mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage became: ",mem_usg," MB")

    return df

In [5]:
# Root of the competition dataset; every input below is resolved against it.
p = '/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/'

# Signal files.  The patterns match two levels below train/ and test/ and one
# level below unlabeled/.  Sorting makes the file order (and therefore the row
# order of every concatenated frame) reproducible between runs.
train = sorted(glob.glob(p+'train/**/**'))
test = sorted(glob.glob(p+'test/**/**'))
unlabeled = sorted(glob.glob(p+'unlabeled/**'))

# Lookup tables shipped with the dataset (each one is read exactly once).
subjects = pd.read_csv(p+'subjects.csv')
tasks = pd.read_csv(p+'tasks.csv')
events = pd.read_csv(p+'events.csv')
sub = pd.read_csv(p+'sample_submission.csv')

# Per-recording metadata, tagged with the module it belongs to and stacked
# into a single table.
tdcsfog_metadata = pd.read_csv(p+'tdcsfog_metadata.csv')
defog_metadata = pd.read_csv(p+'defog_metadata.csv')
daily_metadata = pd.read_csv(p+'daily_metadata.csv')
tdcsfog_metadata['Module']='tdcsfog'
defog_metadata['Module']='defog'
daily_metadata['Module']='daily'
metadata=pd.concat([tdcsfog_metadata,defog_metadata, daily_metadata])

In [6]:
# Sanity check: the parent folder name of a signal file is the value `reader` stores in its 'Module' column.
pathlib.Path(unlabeled[1]).parts[-2]

'unlabeled'

In [7]:
def encoder(file, file2=None, all_cols=False, cols=None, combined=False):
    """Label-encode columns of one frame, or of two frames with shared codes.

    Parameters
    ----------
    file : pandas.DataFrame
        Frame to encode.  In the non-combined mode it is modified in place.
    file2 : pandas.DataFrame, optional
        Second frame; required when ``combined`` is true.
    all_cols : bool
        When true, encode every ``object`` column of ``file`` instead of ``cols``.
    cols : list of str, optional
        Columns to encode when ``all_cols`` is false.  In combined mode these
        are also the columns returned.
    combined : bool
        When true, each encoder is fitted on ``file`` and ``file2`` stacked
        together so both parts share one set of codes; the parts are then
        split apart again.

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        The encoded ``file``; or, in combined mode, ``(file_part, file2_part)``
        restricted to the selected columns.

    Raises
    ------
    TypeError
        If neither ``all_cols`` nor ``cols`` selects any columns, or if
        ``combined`` is true and ``file2`` is missing.

    Notes
    -----
    Columns named ``'Id'`` and ``'Subject'`` are never encoded.
    """
    le=LabelEncoder()
    if all_cols:
        cat_cols=list(file.select_dtypes('O').columns)
    elif cols is not None:
        cat_cols=list(cols)
    else:
        raise TypeError("encoder() needs `cols` when `all_cols` is False")

    if combined:
        if file2 is None:
            raise TypeError("encoder() needs `file2` when `combined` is True")
        # assign() works on copies, so the callers' frames (often column
        # slices) do not receive a stray 'train' marker column.
        file=pd.concat([file.assign(train=1), file2.assign(train=0)])

    for col in cat_cols:
        if col not in ['Id', 'Subject']:
            # LabelEncoder expects a 1-D array of labels.
            file[col]=le.fit_transform(file[col].to_numpy())

    if combined:
        selected = cols if cols is not None else cat_cols
        train, test=file[file['train']==1], file[file['train']==0]
        return train[selected], test[selected]
    return file

def get_columns(cols):
    """Expand column names into the flat names of a six-statistic aggregation.

    ``'Id'`` and ``'Subject'`` are passed through unchanged.  Every other name
    ``c`` is replaced by ``c_mean, c_sum, c_count, c_max, c_min, c_std`` in
    that order.  The result is a new list.
    """
    passthrough = ('Id', 'Subject')
    stats = ('mean', 'sum', 'count', 'max', 'min', 'std')
    expanded = []
    for name in cols:
        if name in passthrough:
            expanded.append(name)
            continue
        expanded.extend(name + f'_{stat}' for stat in stats)
    return expanded
        
    

def get_scaler(data, types):
    """Standardize every numeric column of ``data`` in place.

    Only ``types == 'std'`` is handled: each numeric column is replaced by the
    output of a ``StandardScaler`` fitted on that column alone, and ``data`` is
    returned.  For any other ``types`` nothing happens and ``None`` is returned.
    """
    if types != 'std':
        return None
    standardizer = StandardScaler()
    for name in data.select_dtypes('number').columns:
        # The scaler wants a 2-D (n_samples, 1) array.
        column_2d = np.array(data[name]).reshape(-1, 1)
        data[name] = standardizer.fit_transform(column_2d)
    return data
            
        
    
def label_cluster(data, n_clusters=10):
    """Return a KMeans cluster label for each row of ``data``.

    Only the numeric columns are used; they are standardized with
    ``get_scaler`` before clustering.  The KMeans seed is fixed
    (``random_state=3``).
    """
    numeric_scaled = get_scaler(data.select_dtypes('number'), 'std')
    kmeans = cluster.KMeans(n_clusters=n_clusters, random_state=3)
    return kmeans.fit_predict(numeric_scaled)

def get_pca(data1, data2=None, n_components=2, combined=False, transform=False):
    """Fit a PCA on the standardized numeric columns, growing it to 95% variance.

    The PCA starts with ``n_components`` components.  While the explained
    variance ratio sums to less than 0.95 the component count is grown
    (``n -> 2n + 2``) and the model refitted.  The count is capped at
    ``min(n_samples, n_features)``, the largest value PCA accepts.

    Parameters
    ----------
    data1 : pandas.DataFrame
        Input frame.
    data2 : pandas.DataFrame, optional
        Second frame, stacked under ``data1`` when ``combined`` is true.
    n_components : int
        Initial number of components.
    combined : bool
        Fit on ``data1`` and ``data2`` together.
    transform : bool
        Return only the fitted model.

    Returns
    -------
    object
        * ``transform=True``: the fitted ``PCA`` model.
        * otherwise, ``combined=True``: ``(n_components, model)``.
        * otherwise: the standardized numeric frame the model was fitted on.
    """
    if combined:
        data=pd.concat([data1, data2])
        del data1, data2
        gc.collect()
    else:
        data=data1
    data=data.select_dtypes('number')
    data=get_scaler(data, 'std')

    max_components = min(data.shape)
    model=PCA(n_components=n_components).fit(data)
    var=model.explained_variance_ratio_
    # Stop growing once every available component is in use; asking PCA for
    # more than min(n_samples, n_features) components raises.
    while sum(var)<.95 and n_components < max_components:
        n_components = min(2*n_components + 2, max_components)
        model=PCA(n_components=n_components).fit(data)
        var=model.explained_variance_ratio_

    if transform:
        return model
    elif combined:
        n_cols=n_components
        del data
        gc.collect()
        print(sum(var), n_cols)
        return n_cols, model
    else:
        print(var)
        # NOTE(review): this returns the standardized input, not the PCA
        # projection (model.transform(data)) -- confirm which one callers want.
        return data


# Per-recording aggregates of the event and task tables.  They are built on
# copies and bound to their own names (`events_agg`, `tasks_agg`): `encoder`
# modifies its argument in place, and later cells still need the raw `tasks`
# columns ('Begin', 'End', 'Task').
agg_stats = ['mean', 'sum', 'count', 'max', 'min', 'std']  # same order as get_columns()

events_agg = encoder(events.copy(), all_cols=True)
events_agg['duration_events'] = events_agg['Completion'] - events_agg['Init']
events_cols = get_columns(events_agg.columns)
events_agg = events_agg.groupby('Id').agg(agg_stats).reset_index().fillna(na)
events_agg.columns = events_cols

tasks_agg = tasks.copy()
tasks_agg['duration_task'] = tasks_agg['End'] - tasks_agg['Begin']
tasks_agg = encoder(tasks_agg, all_cols=True)
tasks_cols = get_columns(tasks_agg.columns)
tasks_agg = tasks_agg.groupby('Id').agg(agg_stats).reset_index()
tasks_agg.columns = tasks_cols
tasks_agg.head(3)

In [8]:
# Adapted from https://www.kaggle.com/code/jazivxt/familiar-solvs
# Both tables are re-read from disk so this cell gives the same result no
# matter what earlier cells (or a previous run of this cell) did to them.
tasks = pd.read_csv(p+'tasks.csv')
tasks['Duration'] = tasks['End'] - tasks['Begin']
# One row per recording Id, one column per Task holding its total duration.
tasks = pd.pivot_table(tasks, values=['Duration'], index=['Id'], columns=['Task'], aggfunc='sum', fill_value=0)
tasks.columns = [c[-1] for c in tasks.columns]
tasks = tasks.reset_index()
tasks['t_kmeans'] = cluster.KMeans(n_clusters=10, random_state=3).fit_predict(tasks[tasks.columns[1:]])

subjects = pd.read_csv(p+'subjects.csv')
# numeric_only=True: skip any non-numeric column instead of raising on it.
subjects = subjects.fillna(0).groupby('Subject').median(numeric_only=True)
subjects = subjects.reset_index()
subjects['s_kmeans'] = cluster.KMeans(n_clusters=10, random_state=3).fit_predict(subjects[subjects.columns[1:]])
subjects=subjects.rename(columns={'Visit':'s_Visit','Age':'s_Age','YearsSinceDx':'s_YearsSinceDx','UPDRSIII_On':'s_UPDRSIII_On','UPDRSIII_Off':'s_UPDRSIII_Off','NFOGQ':'s_NFOGQ'})

# Derived and aggregated subject features live in their own frame.  `subjects`
# keeps its 's_*' columns (notably 's_kmeans'), which the metadata merge and
# `reader` look up by exactly those names.
subjects_agg = subjects.copy()
subjects_agg['NFOGQ/Visit']=subjects_agg['s_NFOGQ']/subjects_agg['s_Visit']
subjects_agg['diff_offon']=subjects_agg['s_UPDRSIII_Off']-subjects_agg['s_UPDRSIII_On']
subjects_agg['started_year']=subjects_agg['s_Age']-subjects_agg['s_YearsSinceDx']
subjects_agg['rateof_fog/age']=subjects_agg['s_NFOGQ']/(subjects_agg['s_YearsSinceDx']+subjects_agg['s_Age'])
subjects_agg=encoder(subjects_agg, all_cols=True)
subjects_cols=get_columns(subjects_agg.columns)
subjects_agg=subjects_agg.groupby('Subject').agg(['mean', 'sum', 'count', 'max', 'min', 'std']).reset_index()
subjects_agg.columns=subjects_cols
# The ratios above divide by columns that can be 0 after the fillna(0);
# treat the resulting +/-inf like any other missing value.
subjects_agg=subjects_agg.replace([np.inf, -np.inf], np.nan).fillna(na)
subjects_agg=reduce_memory_usage(subjects_agg)
subjects_agg.head(3)

In [9]:
#events['cluster_events']=label_cluster(events)
#events[['events_pca1', 'events_pca2']]=get_pca(events)
#selected_events=events[['Id','cluster_events', 'events_pca1', 'events_pca2']]

#tasks['cluster_tasks']=label_cluster(tasks)
#tasks[['tasks_pca1', 'tasks_pca2']]=get_pca(tasks)
#selected_task=tasks[['Id','cluster_tasks', 'tasks_pca1', 'tasks_pca2']]

#subjects['cluster_subjects']=label_cluster(subjects)
#subjects[['subjects_pca1', 'subjects_pca2']]=get_pca(subjects)
#selected_subjects=subjects[['Subject', 'cluster_subjects', 'subjects_pca1', 'subjects_pca2']]

In [10]:
# Names of the recording-level and subject-level ('s_*') metadata features.
# The list itself is not referenced elsewhere in the visible notebook.
complex_featlist = [
    'Visit', 'Test', 'Medication',
    's_Visit', 's_Age', 's_YearsSinceDx', 's_UPDRSIII_On', 's_UPDRSIII_Off', 's_NFOGQ',
    's_kmeans',
]
# Attach the per-subject columns to every recording's metadata row.
metadata_complex = pd.merge(metadata, subjects, how='left', on='Subject').copy()
# Replace the medication label by an integer code (missing values get -1).
metadata_complex['Medication'] = pd.factorize(metadata_complex['Medication'])[0]


In [11]:
# Integer code for each value of the 'Module' column (the parent folder name of a signal file).
# Module names that are not keys here become NaN under `Series.map(meta_dict)`.
meta_dict={'tdcsfog':1, 'defog':2, 'notype':3}

## Create a tsflex feature collection

In [12]:
def _windowed_descriptors(feature_dict):
    """Wrap a seglearn feature dict for the three acceleration series.

    Every descriptor uses a window of 5_000 and a stride of 5_000, so
    consecutive windows do not overlap.
    """
    return MultipleFeatureDescriptors(
        functions=seglearn_feature_dict_wrapper(feature_dict),
        series_names=['AccV', 'AccML', 'AccAP'],
        windows=[5_000],
        strides=[5_000],
    )


basic_feats = _windowed_descriptors(base_features())

emg_functions = emg_features()
# Same quantity as abs_energy, which base_features already provides.
del emg_functions['simple square integral']
emg_feats = _windowed_descriptors(emg_functions)

fc = FeatureCollection([basic_feats, emg_feats])

## Extract the features (with Time_frac feature)

In [13]:
def get_peaks(df, col):
    """For every row, count the rows that share its next low-prominence peak.

    Peaks of ``df[col]`` with prominence of at most 0.6 are located with
    ``scipy.signal.find_peaks``.  Each row is tagged with the position of the
    first such peak at or after it, and the returned value is the number of
    rows carrying the same tag.  Rows after the last peak get NaN.

    ``df`` is not modified, and rows are addressed by position, so the result
    does not depend on the index labels or on earlier calls for other columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the signal.
    col : str
        Name of the signal column.

    Returns
    -------
    pandas.Series
        Series named ``'peaks'``, indexed like ``df``.
    """
    peak_positions, _ = find_peaks(df[col].to_numpy(), prominence=(None, 0.6))
    # Tag peak rows with their own position, then pull each tag backwards so
    # every row points at the next peak.
    next_peak = pd.Series(np.nan, index=df.index, name='peaks')
    next_peak.iloc[peak_positions] = peak_positions
    next_peak = next_peak.bfill()
    # value_counts() ignores NaN, so rows without a following peak map to NaN.
    return next_peak.map(next_peak.value_counts())
def get_features(df, col, n_lag):
    """Add lagged copies of ``df[col]`` to ``df`` and return just those columns.

    Lags ``1 .. n_lag - 1`` are created (``n_lag`` itself is excluded) as
    columns ``lag_{col}_{k}``; the leading gap of each lag is filled with 0.
    The new columns are written into ``df`` itself.
    """
    lag_names = []
    for step in range(1, n_lag):
        name = f'lag_{col}_{step}'
        df[name] = df[col].shift(step).fillna(0)
        lag_names.append(name)
    return df[lag_names]

In [14]:

import pathlib
def reader(f, all=False, train=True, unlabeled=False):
    """Load one signal file and attach metadata plus tsflex window features.

    Parameters
    ----------
    f : str
        Path of the file.  The file name up to its first dot becomes ``Id``
        and the parent folder name becomes ``Module``.
    all : bool
        ``False``: read only the three acceleration columns and return the
        tsflex features alone.  ``True``: return the per-sample frame enriched
        with ``Time_frac``, task/subject metadata and forward-filled window
        features.
    train : bool
        With ``all=True``, also read the label columns (plus ``Task`` and
        ``Valid`` for files inside a ``defog`` folder).
    unlabeled : bool
        With ``all=True``, read ``f`` as parquet instead of CSV.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when ``all=True`` and the file could not be processed.  The
        failure is printed so that dropped files are visible.
    """
    if not all:
        df = pd.read_csv(f,  usecols=['AccV', 'AccML', 'AccAP'])
        df = fc.calculate(df, return_df=True, include_final_window=True, approve_sparsity=True, window_idx="begin").astype(np.float32)
        return df

    if train:
        cols=['Time', 'AccV', 'AccML', 'AccAP', 'StartHesitation','Turn' ,'Walking']
        if 'defog'==pathlib.Path(f).parts[-2]:
            cols=cols+['Task', 'Valid']
    else:
        cols=['AccV', 'AccML', 'AccAP', 'Time']

    try:
        if unlabeled:
            df = pd.read_parquet(f)
        else:
            df = pd.read_csv(f, index_col="Time", usecols=cols)
        df['Id'] = f.split('/')[-1].split('.')[0]
        df['Module'] = pathlib.Path(f).parts[-2]

        # Position within the recording, scaled by the largest index value.
        # For CSV input the index is the "Time" column (index_col above).
        df['Time_frac']=(df.index/df.index.max()).values

        # Merging on the 'Id' column replaces the index with a fresh 0..n-1
        # range; recordings without a match get -1 in the merged columns.
        df = pd.merge(df, tasks[['Id','t_kmeans']], how='left', on='Id').fillna(-1)
        df = pd.merge(df, metadata_complex[['Id','Subject']+['Visit','Test','Medication','s_kmeans']], how='left', on='Id').fillna(-1)

        # One feature row per window, indexed at the window start; joining on
        # the index and forward-filling spreads it over the window's samples.
        df_feats = fc.calculate(df, return_df=True, include_final_window=True, approve_sparsity=True, window_idx="begin").astype(np.float32)
        df = df.merge(df_feats, how="left", left_index=True, right_index=True)
        df = df.ffill()
        return df
    except Exception as exc:
        # Best effort: one unreadable file must not abort the whole load, but
        # the skip is reported instead of being swallowed silently.
        print(f'reader: skipping {f!r}: {type(exc).__name__}: {exc}')
        return None

In [15]:
def sample_reader(f, col, module):
    """Read one CSV and keep its rows only if the file belongs to ``module``.

    ``Id`` (file name up to the first dot) and ``Module`` (parent folder name)
    are added.  When both ``Task`` and ``Valid`` columns exist, only rows where
    both are true are kept.  ``Subject`` is attached from the module-level
    ``metadata`` table.  ``col`` is accepted but not used here.
    """
    file_id = f.split('/')[-1].split('.')[0]
    folder = pathlib.Path(f).parts[-2]
    frame = pd.read_csv(f).assign(Id=file_id, Module=folder)
    frame = frame[frame['Module'] == module]
    if {'Task', 'Valid'}.issubset(frame.columns):
        usable = (frame['Task'] == True) & (frame['Valid'] == True)
        frame = frame[usable]
    return frame.merge(metadata[['Id', 'Subject']], how='left', on='Id')
    
def get_data_index(f, col, module):
    """Split the recordings of one module into 5 stratified, subject-grouped folds.

    All files in ``f`` are read with ``sample_reader`` and pooled.
    ``StratifiedGroupKFold(5)`` then splits the rows, stratifying on ``col``
    and grouping by ``Subject``.

    Returns
    -------
    tuple of list
        ``(train_index, val_index)``: for each fold, the array of unique ``Id``
        values on the training side and on the validation side.
    """
    frames = [sample_reader(path, col, module) for path in tqdm(f)]
    pooled = pd.concat(frames).reset_index(drop=True)
    splitter = StratifiedGroupKFold(5)
    train_index, val_index = [], []
    for fit_rows, held_rows in tqdm(splitter.split(pooled, pooled[col], pooled.Subject)):
        train_index.append(pooled.loc[fit_rows, 'Id'].unique())
        val_index.append(pooled.loc[held_rows, 'Id'].unique())
    del pooled
    gc.collect()
    return train_index, val_index

In [16]:
#turn_train_idx, turn_val_indx=get_data_index(train, 'Turn', 'tdcsfog')
#turn_train_idx2, turn_val_indx2=get_data_index(train, 'Turn', 'defog')
#walk_train_idx, walk_val_indx=get_data_index(train, 'Walking', 'tdcsfog')
#walk_train_idx2, walk_val_indx2=get_data_index(train, 'Walking', 'defog')
#st_train_idx, st_val_indx=get_data_index(train, 'StartHesitation', 'tdcsfog')
#st_train_idx2, st_val_indx2=get_data_index(train, 'StartHesitation', 'defog')

In [17]:
def save_index(label,data, data2):
    """Write the file paths of five folds to ``{label}.csv``.

    For fold ``i`` the column ``{label}_{i}`` lists every path of the
    module-level ``train`` file list whose Id (file name up to the first dot)
    is in ``data[i]``, followed by those whose Id is in ``data2[i]``.
    """
    def _file_id(path):
        return path.split('/')[-1].split('.')[0]

    table = pd.DataFrame()
    for fold in range(5):
        first_ids, second_ids = data[fold], data2[fold]
        paths = [path for path in train if _file_id(path) in first_ids]
        paths = paths + [path for path in train if _file_id(path) in second_ids]
        column = pd.DataFrame(paths, columns=[f'{label}_{fold}'])
        table = pd.concat([table, column], axis=1)
    table.to_csv(f'{label}.csv', index=False)
        
    

In [18]:
#save_index('Turn', turn_train_idx, turn_train_idx2)
#save_index('Turn2', turn_val_indx, turn_val_indx)
#save_index('Walking', walk_train_idx, walk_train_idx2)
#save_index('Walking2', walk_val_indx, walk_val_indx2)
#save_index('StartHesitation', st_train_idx, st_train_idx2)
#save_index('StartHesitation2', st_val_indx, st_val_indx2)

In [19]:
def get_clean_feat(train, test, train_df=True):
    """Collapse the five lag columns of each axis into one label-encoded column.

    For each of ``AccV``, ``AccML`` and ``AccAP`` the columns
    ``lag_{axis}_1 .. lag_{axis}_5`` are converted to strings and concatenated
    row-wise into ``lag_{axis}``.  That column is then label-encoded with codes
    shared between ``train`` and ``test``, and the fifteen source columns are
    dropped.  Both frames are always processed; ``train_df`` only selects which
    one is returned.
    """
    lag_steps = range(1, 6)
    axes = ['AccV', 'AccML', 'AccAP']

    for axis in axes:
        sources = [f'lag_{axis}_{i}' for i in lag_steps]
        train[f'lag_{axis}'] = train[sources].astype(str).sum(axis=1)
        test[f'lag_{axis}'] = test[sources].astype(str).sum(axis=1)

    for axis in axes:
        key = f'lag_{axis}'
        train[key], test[key] = encoder(train[[key]], file2=test[[key]], cols=[key], combined=True)

    obsolete = [f'lag_{axis}_{i}' for axis in ('AccV', 'AccAP', 'AccML') for i in lag_steps]
    train = train.drop(obsolete, axis=1)
    test = test.drop(obsolete, axis=1)
    return train if train_df else test
    

## Train the model

In [20]:
# Tuned hyper-parameters.  The keys carry an 'estimator__' prefix, which is
# stripped below so the dict can be splatted straight into a model constructor.
best_params_ = {
    'estimator__colsample_bytree': 0.5282057895135501,
    'estimator__learning_rate': 0.22659963168004743,
    'estimator__max_depth': 8,
    'estimator__min_child_weight': 3.1233911067827616,
    'estimator__n_estimators': 291,
    'estimator__subsample': 0.9961057796456088,
}
best_params_ = {name.partition('__')[2]: value for name, value in best_params_.items()}

In [21]:
def custom_average_precision(y_true, y_pred):
    """Evaluation callback returning ``(metric name, value, higher_is_better)``.

    The value is ``average_precision_score(y_true, y_pred)``; the trailing
    ``True`` marks larger scores as better.
    """
    return ('average_precision', average_precision_score(y_true, y_pred), True)

In [22]:
def train_model(train, label):
    """Fit one LightGBM regressor per pre-computed fold for one target column.

    Parameters
    ----------
    train : pandas.DataFrame
        Per-sample frame containing ``Id``, ``Module``, the feature columns and
        the ``label`` column.  The caller's frame is not modified.
    label : str
        Target column.  It also selects the fold files
        ``/kaggle/input/subject/{label}.csv`` (training-side file paths) and
        ``/kaggle/input/subject/{label}2.csv`` (validation-side file paths),
        whose columns are named ``{label}_{i}`` and ``{label}2_{i}``.

    Returns
    -------
    list
        The fitted models, one per fold (five in total).
    """
    n_folds = 5
    non_features = ['Id','Subject', 'Time', 'StartHesitation', 'Turn' , 'Walking', 'Valid', 'Task','Event']

    # reset_index returns a new frame, so the in-place work below never
    # touches the frame the caller passed in.
    train=train.reset_index(drop=True)
    train=reduce_memory_usage(train)
    train['Module']=train['Module'].map(meta_dict)
    cols = [c for c in train.columns if c not in non_features]
    cols=train[cols].select_dtypes('number').columns

    def _fold_ids(csv_path, column):
        """Ids (file name up to the first dot) listed in one fold column."""
        paths = pd.read_csv(csv_path)[column].dropna()
        # Take the last path component rather than a fixed position, so the
        # lookup does not depend on how deep the dataset is mounted.
        return paths.str.split('/').str[-1].str.split('.').str[0]

    cvs=[]
    preds=[]
    for i in range(n_folds):
        print(f'fold is {i+1} of {n_folds}')
        train_ids=_fold_ids(f'/kaggle/input/subject/{label}.csv', f'{label}_{i}')
        val_ids=_fold_ids(f'/kaggle/input/subject/{label}2.csv', f'{label}2_{i}')

        # Build each row mask once; the frame is large and every mask is
        # needed several times below.
        fit_rows = train['Id'].isin(train_ids)
        val_rows = train['Id'].isin(val_ids)
        x_val, y_val = train.loc[val_rows, cols], train.loc[val_rows, label]

        # LightGBM regressor with the tuned parameters.
        model=lgb.LGBMRegressor(**best_params_)
        model.fit(
            train.loc[fit_rows, cols], train.loc[fit_rows, label],
            eval_set=[(x_val, y_val)],
            eval_metric=custom_average_precision,
            # Callbacks replace the `early_stopping_rounds` / `verbose` fit
            # arguments, which newer LightGBM releases no longer accept.
            callbacks=[lgb.early_stopping(stopping_rounds=25), lgb.log_evaluation(period=50)],
        )

        preds.append(model)
        cv=average_precision_score(y_val, model.predict(x_val))
        cvs.append(cv)
        # Running mean of the fold scores so far.
        print(np.mean(cvs, axis=0))
    print(cvs)
    del train
    gc.collect()
    return preds

In [23]:
#cols=['Time_frac', 'Accv_change', 'AccML_change','AccAP_change', 'peaks_AccV', 'peaks_AccML', 'peaks_AccAP', 'lag_AccV', 'lag_AccAP', 'lag_AccML']

In [24]:
def get_prediction(data):
    """Train the fold models for StartHesitation, Turn and Walking, in that order.

    Returns ``(preds1, preds2, preds3)``.  The shapes are asymmetric and the
    inference cell relies on it: ``preds1`` and ``preds2`` are 1-tuples
    wrapping the list of fold models (read them as ``preds1[0]``), while
    ``preds3`` is the list itself.
    """
    start_hesitation_models = train_model(data, 'StartHesitation')
    turn_models = train_model(data, 'Turn')
    walking_models = train_model(data, 'Walking')
    return (start_hesitation_models,), (turn_models,), walking_models

In [25]:
def get_data(train_data=True, include_unlalebed=False):
    """Load and concatenate the enriched per-sample frames of a file set.

    Parameters
    ----------
    train_data : bool
        ``True``: read the module-level ``train`` files with their labels.
        ``False``: read the module-level ``test`` files.
    include_unlalebed : bool
        Only with ``train_data=True``: also append the module-level
        ``unlabeled`` files (read as parquet).  Label columns that those files
        do not provide are NaN in the appended rows.

    Returns
    -------
    pandas.DataFrame
        All files stacked, each keeping its own row index; missing values
        inside each part are filled with ``na``.

    Notes
    -----
    ``reader`` returns ``None`` for a file it could not process, and
    ``pd.concat`` drops ``None`` entries silently (it raises only when every
    entry is ``None``).
    """
    if not train_data:
        return pd.concat([reader(f, all=True, train=False) for f in tqdm(test)]).fillna(na)

    df = pd.concat([reader(f, all=True, train=True) for f in tqdm(train)]).fillna(na)
    if include_unlalebed:
        # Read the unlabeled files here; re-reading `train` would merely
        # duplicate every training row.
        df2 = pd.concat([reader(f, all=True, train=False, unlabeled=True) for f in tqdm(unlabeled)]).fillna(na)
        df = pd.concat([df, df2])
    return df


In [26]:
# Loads the training frame and fits the fold models for all three targets.
# preds1 and preds2 come back as 1-tuples around their list of fold models
# (hence `preds1[0]` / `preds2[0]` later); preds3 is the list itself.
preds1, preds2, preds3=get_prediction(get_data())

  0%|          | 0/970 [00:00<?, ?it/s]

Memory usage of dataframe is 7068.46 MB
Memory usage became:  2866.6959228515625  MB
fold is 1 of 15
0.3641101154934483
fold is 2 of 15
0.1869910543544282
fold is 3 of 15
0.1286337189998686
fold is 4 of 15
0.09749762546797398
fold is 5 of 15
0.07964230942532288
[0.3641101154934483, 0.0098719932154081, 0.011919048290749395, 0.004089344872290105, 0.008221045254718453]
Memory usage of dataframe is 7068.46 MB
Memory usage became:  2866.6959228515625  MB
fold is 1 of 15
0.6401123551992335
fold is 2 of 15
0.5827845306808914
fold is 3 of 15
[50]	valid_0's l2: 0.225021	valid_0's average_precision: 0.633854
0.6068044245579816
fold is 4 of 15
0.6057165904378587
fold is 5 of 15
[50]	valid_0's l2: 0.12086	valid_0's average_precision: 0.366948
0.5617389091654112
[0.6401123551992335, 0.5254567061625495, 0.6548442123121617, 0.6024530880774902, 0.38582818407562114]
Memory usage of dataframe is 7068.46 MB
Memory usage became:  2866.6959228515625  MB
fold is 1 of 15
0.018065679802919513
fold is 2 of 15


In [27]:
def read_parquet(f):
    """Read one parquet file, add change features and pass it to ``sem_supervised``.

    ``Id`` is the file name up to its first dot and ``Module`` is always
    ``'defog'``.  For each acceleration axis the absolute one-step change is
    stored; ``all_change`` is the row-wise sum of the three.  The axis columns
    themselves are then replaced by their absolute values.

    NOTE(review): ``sem_supervised`` is not defined anywhere in the visible
    notebook, so calling this function raises ``NameError`` unless it is
    provided elsewhere -- confirm.
    """
    frame = pd.read_parquet(f)
    frame['Id'] = f.split('/')[-1].split('.')[0]
    frame['Module'] = 'defog'

    change_names = {
        'AccV': 'Accv_change_1',
        'AccML': 'AccML_change_1',
        'AccAP': 'AccAP_change_1',
    }
    for axis, change_name in change_names.items():
        frame[change_name] = (frame[axis] - frame[axis].shift(1)).abs()
    frame['all_change'] = frame[list(change_names.values())].abs().sum(axis=1)

    for axis in change_names:
        frame[axis] = frame[axis].abs()

    return sem_supervised(frame)
    

In [28]:
#org_cols=['AccV', 'AccML', 'AccAP', 'Time']

In [29]:
# preds1 / preds2 are 1-tuples around their fold-model lists (see
# get_prediction); preds3 is already the bare list.
lst_models=[preds1[0], preds2[0], preds3]

# Test frame, prepared the same way train_model prepares the training frame.
df=reduce_memory_usage(get_data(train_data=False))
df['Module']=df['Module'].map(meta_dict)
cols = df[
    [c for c in df.columns if c not in ['Id','Subject', 'Time', 'StartHesitation', 'Turn' , 'Walking', 'Valid', 'Task','Event']]
].select_dtypes('number').columns

# One prediction vector per target: the mean over the five fold models.
all_preds = [
    np.mean([regs[i_fold].predict(df[cols]) for i_fold in range(5)], axis=0)
    for regs in lst_models
]
    


  0%|          | 0/2 [00:00<?, ?it/s]

Memory usage of dataframe is 89.58 MB
Memory usage became:  39.32728958129883  MB


In [30]:

# Row-level ids have the form '<file Id>_<row index within that file>'.
row_ids = df['Id'].astype(str) + '_' + df.index.astype(str)
submission = pd.DataFrame({
    'Id': row_ids,
    'StartHesitation': all_preds[0],
    'Turn': all_preds[1],
    'Walking': all_preds[2],
})
# Keep exactly the rows of the sample submission; ids without a prediction get 0.0.
submission = sub[['Id']].merge(submission, how='left', on='Id').fillna(0.0)
submission.to_csv('submission.csv', index=False)

## Predict for test

In [31]:
# 'submission.csv' is already written by the cell that builds `submission`;
# writing it a second time here was redundant, so this cell only displays it.
submission


Unnamed: 0,Id,StartHesitation,Turn,Walking
0,003f117e14_0,0.031643,-0.000238,0.011173
1,003f117e14_1,0.031643,-0.000238,0.011173
2,003f117e14_2,0.031643,-0.000238,0.011173
3,003f117e14_3,0.031643,-0.000238,0.011173
4,003f117e14_4,0.031643,-0.000238,0.011173
...,...,...,...,...
286365,02ab235146_281683,0.011291,0.033544,0.013494
286366,02ab235146_281684,0.011291,0.033544,0.013494
286367,02ab235146_281685,0.011291,0.033544,0.013494
286368,02ab235146_281686,0.011291,0.033544,0.013494
