In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.impute import MissingIndicator

In [2]:
# Relative path of the training-set CSV that the next cell loads.
sheet_name = '../data/train_set.csv'

In [3]:
# Load the training set into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer=sheet_name)

In [4]:
# Display the column labels that were read from the CSV.
df.columns

Index(['Unnamed: 0', 'DTronc', 'LTronc', 'DChar', 'LChar', 'DChar1', 'LChar1',
       'DChar2', 'LChar2', 'DChar3', 'LChar3', 'DChar4', 'LChar4', 'PFr1',
       'PFr2', 'PFr3', 'PFr4', 'PFr6', 'DChar5', 'LChar5', 'PFr5'],
      dtype='object')

# Target feature

In [5]:
def convert_to_nan(x):
    """Coerce a single value to a number, mapping unparseable input to NaN.

    Used to get rid of typos and unwanted strings in columns that are meant
    to be numeric.

    Parameters
    ----------
    x : object
        Value to convert; it is handed to ``pd.to_numeric`` unchanged.

    Returns
    -------
    number or float
        The result of ``pd.to_numeric(x)``, or ``np.nan`` when the conversion
        raises ``ValueError`` or ``TypeError``.
    """
    try:
        return pd.to_numeric(x)
    except (ValueError, TypeError):
        # Only conversion failures are swallowed. A bare ``except`` would also
        # trap KeyboardInterrupt/SystemExit, making a long ``apply`` over a
        # column impossible to interrupt and hiding unrelated errors.
        return np.nan

In [6]:
# Shared estimators kept at notebook scope: process_column() re-fits both on
# every call, and later cells re-fit `scaler` on other columns.
scaler = MinMaxScaler()
imputer = SimpleImputer(strategy='mean')


def process_column(df, column_name, strategy,
                   scaler_path='scaler.pkl', imputer_path='imputer.pkl'):
    """Clean, impute and min-max scale one column of ``df`` in place.

    Steps:
      1. every value goes through ``convert_to_nan`` (unparseable -> NaN);
      2. missing values are filled by the shared ``imputer`` using ``strategy``;
      3. the result is rescaled by the shared ``scaler``;
      4. both fitted estimators are pickled to disk.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    column_name : str
        Label of the column to process.
    strategy : str
        Imputation strategy forwarded to ``SimpleImputer`` (e.g. ``'mean'``).
    scaler_path : str, optional
        Destination of the pickled scaler. Defaults to ``'scaler.pkl'``.
    imputer_path : str, optional
        Destination of the pickled imputer. Defaults to ``'imputer.pkl'``.

    Returns
    -------
    None

    Notes
    -----
    Both estimators are re-fitted on every call. With the default paths each
    call overwrites the previous pickle files, so after looping over several
    columns the files describe only the last column processed. Pass distinct
    paths per column to keep every fit.
    """
    # Convert column values to numbers, turning typos/strings into NaN.
    df[column_name] = df[column_name].apply(convert_to_nan)

    # Honour the requested strategy. Previously the argument was accepted but
    # ignored, and the imputer always used its construction-time strategy.
    imputer.set_params(strategy=strategy)
    imputed = imputer.fit_transform(df[column_name].values.reshape(-1, 1))

    # Scale the imputed values with the shared MinMaxScaler.
    scaled = scaler.fit_transform(imputed)

    # The estimators return an (n, 1) array; store its only column as 1-D so
    # the assignment does not rely on pandas squeezing a 2-D value.
    df[column_name] = scaled[:, 0]

    # Persist the estimators as fitted on this column.
    with open(scaler_path, 'wb') as file:
        pickle.dump(scaler, file)
    with open(imputer_path, 'wb') as file:
        pickle.dump(imputer, file)

In [7]:
# Target columns: each one is cleaned, mean-imputed and scaled in place.
cols = [
    'PFr1', 'PFr2', 'PFr3',
    'PFr4', 'PFr5', 'PFr6',
]

for target_col in cols:
    process_column(df, target_col, 'mean')

In [8]:
# Aggregate target: row-wise total of the six PFr columns, then re-scaled by
# re-fitting the shared scaler on that total.
pfr_total = df[['PFr1', 'PFr2', 'PFr3', 'PFr4', 'PFr5', 'PFr6']].sum(axis=1)
df['Sum'] = scaler.fit_transform(pfr_total.values.reshape(-1, 1))


# Input features

In [9]:
def fill_missing_with_zero(df, column_name):
    """Replace missing values in ``df[column_name]`` with 0, in place on ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    column_name : label
        Column whose missing values are filled.

    Returns
    -------
    None
    """
    # Assign the filled column back explicitly. Calling
    # ``fillna(..., inplace=True)`` on ``df.loc[:, column_name]`` operates on an
    # intermediate object, which is not guaranteed to write through to ``df``
    # (it never does under pandas Copy-on-Write).
    df[column_name] = df[column_name].fillna(0)

def convert_column_to_float32(df, column_name):
    """Sanitise ``df[column_name]`` and cast it to float32, in place on ``df``.

    Infinite values are turned into NaN, every NaN is then replaced by 0, and
    the column is finally cast to ``float32``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    column_name : label
        Column to sanitise and cast.

    Returns
    -------
    None
    """
    # One assigned chain instead of ``df[col].fillna(0, inplace=True)``: the
    # in-place call acts on an intermediate Series and is not guaranteed to
    # update ``df`` (it never does under pandas Copy-on-Write), which would
    # leave NaN in the float32 column.
    df[column_name] = (
        df[column_name]
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
        .astype('float32')
    )

In [10]:
# Input feature columns to clean and scale.
# DChar5/LChar5 are included because they are exported in the fifth feature set
# (df5); leaving them out would write them uncleaned and unscaled next to
# features that have all been scaled.
cols = ['DTronc', 'LTronc', 'DChar', 'LChar', 'DChar1', 'LChar1', 'DChar2', 'LChar2',
        'DChar3', 'LChar3', 'DChar4', 'LChar4', 'DChar5', 'LChar5']

for col in cols:
    # Typos / stray strings become NaN.
    df[col] = df[col].apply(convert_to_nan)

    # inf and NaN become 0, then the column is cast to float32.
    convert_column_to_float32(df, col)

    # Re-fit the shared scaler on this column alone. The scaler returns an
    # (n, 1) array, so keep its only column as a 1-D value.
    df[col] = scaler.fit_transform(df[col].values.reshape(-1, 1))[:, 0]

# Creating multiple feature sets

In [11]:
# Column groups shared by the exported feature sets.
trunk_and_first_char = ['DTronc', 'LTronc', 'DChar', 'LChar', 'DChar1', 'LChar1']
second_char = ['DChar2', 'LChar2']
target = ['Sum']

# Every set ends with the aggregated target column 'Sum'.
# NOTE(review): df4 omits the Char3 columns and df5 omits both Char3 and Char4;
# confirm this is intended rather than cumulative sets.
df1 = df[trunk_and_first_char + target]

df2 = df[trunk_and_first_char + second_char + target]

df3 = df[trunk_and_first_char + second_char + ['DChar3', 'LChar3'] + target]

df4 = df[trunk_and_first_char + second_char + ['DChar4', 'LChar4'] + target]

df5 = df[trunk_and_first_char + second_char + ['DChar5', 'LChar5'] + target]

In [13]:
# Output CSV paths, in the same order as the feature sets df1..df5.
new_sheets_names = [
    '../data/precoce_1_Char_mean.csv',
    '../data/precoce_2_Char_mean.csv',
    '../data/precoce_3_Char_mean.csv',
    '../data/precoce_4_Char_mean.csv',
    '../data/precoce_5_Char_mean.csv',
]

In [14]:
# Write each feature set to its CSV; frames and paths are paired by position.
# The row index is written as well (to_csv default), as before.
for feature_set, out_path in zip((df1, df2, df3, df4, df5), new_sheets_names):
    feature_set.to_csv(out_path)