In [3]:
import pandas as pd
import numpy as np
import os

# Statistical libraries
from scipy import stats
from numpy import percentile

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import power_transform
from sklearn.preprocessing import MinMaxScaler

# Evaluation Procedures
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# Classification methods
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesRegressor

from sklearn.ensemble import VotingClassifier

# Evaluation Metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

In [4]:
def print_all (to_print):
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
        print(to_print)

In [5]:
def print_his(dat0a, bins):
    hist1 = sns.distplot(data,bins=bins)

In [6]:
def print_clean_col_std(df):
    mean, std = df.mean() , df.std()
    lower, upper = mean - 3*std, mean + 3*std
    all_outliers = 0
    i=0
    for col in df.columns:
        outliers = ((df[col]>upper[col]) | (df[col]<lower[col])).astype(int)
        
        if (outliers.sum()>0):
            print("Variable %s (%.3f,%.3f) has %d outliers"%(col,mean[col],std[col],outliers.sum()))
            all_outliers +=outliers.sum()
        else:
            i+=1
    print('the number of cleaned columns are ', i)
    print('the number of all outliers is ', all_outliers)

In [7]:
def print_clean_col_iqr(df):
    X = df.copy()
    all_outliers=0
    sorted(X)
    # calculate interquartile range
    q25, q75 = X.quantile(0.25), X.quantile(0.75)
    # q25, q75 = np.percentile(X , [25,75])
    iqr = q75 - q25
    cut_off = iqr * 1.5
    lower, upper = q25 - cut_off, q75 + cut_off
    i=0
    for col in X.columns:
        outliers = ((X[col]>upper[col]) | (X[col]<lower[col])).astype(int)
        
        if (outliers.sum()>0):
            print("Variable %s has %d outliers"%(col,outliers.sum()))
            all_outliers +=outliers.sum()
            
        else:
            i+=1
    print('the number of cleaned columns are ', i)
    print('the number of all outliers is ', all_outliers)


In [8]:
# Finding and replace outliers with standard deviation Method 
def replace_outliers_std(transform_df , original_df):
    X =transform_df.copy()
    mean, std = X.mean() , X.std()

    # identify outliers
    cut_off = std * 3
    lower, upper = mean - cut_off, mean + cut_off
    for col in X.columns:
        original_df.loc[X[col]>upper[col], col] = upper[col]
        original_df.loc[X[col]<lower[col], col] = lower[col]

    return original_df

In [9]:
# Finding and replace outliers with Interquartile Range, or IQR  Method 
def replace_outliers_iqr(transform_df , original_df):
    X =transform_df.copy()
    # calculate interquartile range
    q25, q75 = X.quantile(0.25), X.quantile(0.75)
    iqr = q75 - q25
    cut_off = iqr * 1.5
    lower, upper = q25 - cut_off, q75 + cut_off
    for col in X.columns:
      original_df.loc[X[col]>upper[col], col] = upper[col]
      original_df.loc[X[col]<lower[col], col] = lower[col]

    return original_df

In [10]:
# sort Numerical values function
def sorted_m3_numerical_values(numerical_df, reverse = True):
    sorted_Numerical_df = numerical_df.reindex(sorted(numerical_df.columns, reverse=reverse), axis=1)
    sorted_Numerical_columns_list = (list(sorted_Numerical_df.columns))
    New_Numerical_columns_list = []
    for column in sorted_Numerical_columns_list:
        New_column = column.replace('_',' ')
        New_Numerical_columns_list.append(New_column)

    New_Numerical_columns_list.sort(key=lambda s: s.split()[1])

    sorted_Numerical_columns_list.clear()
    for column in New_Numerical_columns_list:
        New_column = column.replace(' ','_')
        sorted_Numerical_columns_list.append(New_column)

    #drop the dataframe
    sorted_Numerical_df = sorted_Numerical_df.iloc[0:0]

    sorted_Numerical_df = numerical_df[sorted_Numerical_columns_list]
    return sorted_Numerical_df


In [11]:
# Loading the DATA  for Daniel, 
#Don't use it if you don't want to mount your google drive with google colab
from google.colab import drive
drive.mount('/content/drive') #,force_remount=Truimport os 
cwd = os.getcwd()
dataset_dir = os.path.join(cwd, 'DMTMDataset')
#train_df=pd.read_csv(os.path.join(dataset_dir,'train.csv'))
dataset_direction='/content/drive/MyDrive/'
# !unzip ('/content/drive/MyDrive/')
train_df=open(os.path.join(dataset_direction,'train.csv'))
df=pd.read_csv((train_df))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
df.describe(include='all')

In [None]:
# df = pd.read_csv('train.csv')

In [None]:
df.hist(figsize = (60,60))

In [12]:
#Sort alarm's correlated columns 
sorted_sum_alarm_df = df.loc[: , 'equipment_sum_alarms_prev14d':'temperature_sum_alarms_prev3d']
sorted_sum_alarm_df = sorted_sum_alarm_df.reindex(sorted(sorted_sum_alarm_df.columns, reverse=True), axis=1)## alarms

#Sort persistance's correlated columns
sorted_persistance_df = (df.loc[:, 'equipment_max_persistance_prev7d': 'temperature_min_persistance_prev3d'])
sorted_persistance_df = sorted_persistance_df.reindex(sorted(sorted_persistance_df.columns, reverse=True), axis=1)## persistance


#sort Numerical's  columns 
sorted_numerical_df = df.loc[: , 'mean_temperature_prev7d':'min_pressure_f_next7d']## Numerical
sorted_numerical_df = sorted_m3_numerical_values(sorted_numerical_df) 

#Skewness and Kurt
skew_cat_alarms_prev14d = df.loc[:,'skew_equipment_alarms_prev14d': 'skew_temperature_alarms_prev14d']
kurt_cat_alarms_prev14d = df.loc[:,'kurt_equipment_alarms_prev14d': 'kurt_temperature_alarms_prev14d']
skwurt = pd.concat([skew_cat_alarms_prev14d,kurt_cat_alarms_prev14d],axis=1)### Skewness and kurt together

In [13]:
# Concatinate all numerical data
all_numerical_df = pd.concat([sorted_numerical_df, sorted_sum_alarm_df,sorted_persistance_df,skwurt],axis=1)

In [15]:
mean, std = all_numerical_df.mean() , all_numerical_df.std()
lower, upper = mean - 3*std, mean + 3*std
print(lower)


min_humidity_prev7d                20.313686
min_humidity_prev3d                22.240948
min_humidity_f_next7d              20.227553
min_humidity_f_next14d             19.172805
mean_humidity_prev7d               31.606389
                                     ...    
kurt_equipment_alarms_prev14d      -2.313809
kurt_fire/smoke_alarms_prev14d     -1.430983
kurt_ge_alarms_prev14d             -1.683278
kurt_power_alarms_prev14d          -4.136532
kurt_temperature_alarms_prev14d    -1.697487
Length: 115, dtype: float64


In [16]:
print(upper)

min_humidity_prev7d                 93.101467
min_humidity_prev3d                100.261054
min_humidity_f_next7d               93.305703
min_humidity_f_next14d              87.603129
mean_humidity_prev7d               101.686492
                                      ...    
kurt_equipment_alarms_prev14d       -0.061152
kurt_fire/smoke_alarms_prev14d      -0.983148
kurt_ge_alarms_prev14d              -0.727987
kurt_power_alarms_prev14d            1.817955
kurt_temperature_alarms_prev14d     -0.695920
Length: 115, dtype: float64


In [None]:
sorted_sum_alarm_df.hist(figsize = (60,60))
sorted_persistance_df.hist(figsize = (60,60))
skwurt.hist(figsize = (60,60))

In [None]:
#check how many columns includes outlier with std method
print_clean_col_std(all_numerical_df)

In [None]:
#check how many columns includes outlier with iqr method
print_clean_col_iqr(all_numerical_df)

In [18]:
# transform original data
transform_all_numerical_df_np = power_transform(all_numerical_df, method = 'yeo-johnson', standardize=False )

after the transformation we get a numpy array we need to convert it again to a data frame

In [22]:
transform_all_numerical_df = pd.DataFrame(transform_all_numerical_df_np, columns = all_numerical_df.columns)

In [23]:
mean, std = transform_all_numerical_df.mean() , transform_all_numerical_df.std()
lower, upper = mean - 3*std, mean + 3*std
print(lower)


min_humidity_prev7d                 25.558898
min_humidity_prev3d                 31.934455
min_humidity_f_next7d               25.869959
min_humidity_f_next14d              22.466496
mean_humidity_prev7d              -120.475175
                                      ...    
kurt_equipment_alarms_prev14d     -139.724929
kurt_fire/smoke_alarms_prev14d    -718.219408
kurt_ge_alarms_prev14d            -896.558514
kurt_power_alarms_prev14d         -243.114661
kurt_temperature_alarms_prev14d   -146.167063
Length: 115, dtype: float64


In [24]:
print(upper)

min_humidity_prev7d                 568.182152
min_humidity_prev3d                 573.097219
min_humidity_f_next7d               535.696723
min_humidity_f_next14d              563.119699
mean_humidity_prev7d               8616.514649
                                      ...     
kurt_equipment_alarms_prev14d       -31.915265
kurt_fire/smoke_alarms_prev14d     -318.835115
kurt_ge_alarms_prev14d             -335.001100
kurt_power_alarms_prev14d           -35.541677
kurt_temperature_alarms_prev14d     -36.251730
Length: 115, dtype: float64


In [None]:
transform_all_numerical_df.hist(figsize=(60,60))

In [None]:
# all_numerical_df_mean = all_numerical_df.mean()
# all_numerical_df_std = all_numerical_df.std()

In [60]:
##replace outliers with std method
all_numerical_df_without_outlier = replace_outliers_std(transform_all_numerical_df, all_numerical_df)

In [None]:
#check if there is any outlier afer executing the function
print_clean_col_std(all_numerical_df_without_outlier)

In [None]:
##replace outliers with iqr method
all_numerical_df_without_outlier = replace_outliers_iqr(transform_all_numerical_df, all_numerical_df)

In [None]:
print_clean_col(all_numerical_df_np)

the number of cleaned columns are  115


In [None]:
#Sort alarm's correlated columns 
sorted_sum_alarm_df = all_numerical_df.loc[: , 'temperature_sum_alarms_prev7d':'equipment_sum_alarms_prev14d' ]
# sorted_sum_alarm_df = sorted_sum_alarm_df.reindex(sorted(sorted_sum_alarm_df.columns, reverse=False), axis=1)## alarms

#Sort persistance's correlated columns
sorted_persistance_df = all_numerical_df.loc[:, 'temperature_min_persistance_prev7d': 'equipment_max_persistance_prev3d']
# sorted_persistance_df = sorted_persistance_df.reindex(sorted(sorted_persistance_df.columns, reverse=False), axis=1)## persistance


#sort Numerical's  columns 
sorted_numerical_df = all_numerical_df.loc[: , 'min_humidity_prev7d':'max_wind_speed_f_next14d']## Numerical
# sorted_numerical_df = sorted_m3_numerical_values(sorted_numerical_df, False) 

#Skewness and Kurt
skew_cat_alarms_prev14d = all_numerical_df.loc[:,'skew_equipment_alarms_prev14d': 'skew_temperature_alarms_prev14d']
kurt_cat_alarms_prev14d = all_numerical_df.loc[:,'kurt_equipment_alarms_prev14d': 'kurt_temperature_alarms_prev14d']
skwurt = pd.concat([skew_cat_alarms_prev14d,kurt_cat_alarms_prev14d],axis=1)### Skewness and kurt together