In [79]:
import pandas as pd 
import zipfile
import pyarrow

### loading the data

# zip file containing all data
zip_path = 'data.zip'
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    # extract all the contents into the current directory
    zip_ref.extractall()

# loading the data file
df_pressure = pd.read_parquet('./data.parquet')
# check if it's loaded correctly
print(df_pressure.head())
print(df_pressure.info())
print(df_pressure.describe())

# loading the labels file
df_labels = pd.read_csv('./labels.csv')
# check if it's loaded correctly
print(df_labels.head())
print(df_labels.describe())

  MachineId  MeasurementId  Pressure
0     0_0_0              0       0.0
1     0_0_0              0       0.0
2     0_0_0              0       0.0
3     0_0_0              0       0.0
4     0_0_0              0       0.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18466364 entries, 0 to 18466363
Data columns (total 3 columns):
 #   Column         Dtype  
---  ------         -----  
 0   MachineId      object 
 1   MeasurementId  int64  
 2   Pressure       float64
dtypes: float64(1), int64(1), object(1)
memory usage: 422.7+ MB
None
       MeasurementId      Pressure
count   1.846636e+07  1.846636e+07
mean    3.398397e+03  3.338926e-01
std     3.019365e+03  4.862787e-01
min    -1.000000e+00  0.000000e+00
25%    -1.000000e+00  0.000000e+00
50%     3.179000e+03  0.000000e+00
75%     6.277000e+03  7.017707e-01
max     8.834000e+03  6.774464e+00
  MachineId  MeasurementId PumpFailed SlowStart SlowEnd
0     0_0_0              0      False     False   False
1     0_0_1             -1  

In [90]:
# encoding MachineId as categorical variable
df_pressure['MachineId'] = df_pressure['MachineId'].astype('category').cat.codes
df_labels['MachineId'] = df_labels['MachineId'].astype('category').cat.codes
print(df_pressure.info())
print(df_labels.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18466364 entries, 0 to 18466363
Data columns (total 3 columns):
 #   Column         Dtype  
---  ------         -----  
 0   MachineId      int16  
 1   MeasurementId  int64  
 2   Pressure       float64
dtypes: float64(1), int16(1), int64(1)
memory usage: 317.0 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27385 entries, 0 to 27384
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MachineId      27385 non-null  int16 
 1   MeasurementId  27385 non-null  int64 
 2   PumpFailed     26900 non-null  object
 3   SlowStart      19300 non-null  object
 4   SlowEnd        19300 non-null  object
dtypes: int16(1), int64(1), object(3)
memory usage: 909.4+ KB
None


In [91]:
import numpy as np

# fucntion to create features for each measured cycle
def create_features(group):
    half = len(group) // 2
    first_half = group.iloc[:half]
    second_half = group.iloc[half:]

    # basic statistical features
    features = {
        'Pressure_mean_first_half': first_half['Pressure'].mean(),
        'Pressure_mean_second_half': second_half['Pressure'].mean(),
        'Pressure_min_first_half': first_half['Pressure'].min(),
        'Pressure_min_second_half': second_half['Pressure'].min(),
    }

    # slope of pressure change - first half of the cycle
    slope, _ = np.polyfit(range(half), first_half['Pressure'], 1)
    features['Pressure_slope_first_half'] = slope

    # adjusting the range for the second half
    second_half_range = range(half, half + len(second_half))
    slope, _ = np.polyfit(second_half_range, second_half['Pressure'], 1)
    features['Pressure_slope_second_half'] = slope

    return pd.Series(features)


In [92]:
# aggregate features
df_features = df_pressure.groupby(['MachineId', 'MeasurementId']).apply(create_features).reset_index()
# check results
print(df_features.head())
print(df_labels.head())
# merge the dataframes
df_merged = pd.merge(df_features, df_labels, on=['MachineId', 'MeasurementId'], how='inner')
# check
print(df_merged.head())

   MachineId  MeasurementId  Pressure_mean_first_half  \
0          0             -1                  0.230289   
1          0              0                  0.605873   
2          0            215                  0.710727   
3          0            237                  0.977540   
4          0            353                  0.344599   

   Pressure_mean_second_half  Pressure_min_first_half  \
0                   0.187364                      0.0   
1                   0.144275                      0.0   
2                   0.000000                      0.0   
3                   0.023251                      0.0   
4                   0.615127                      0.0   

   Pressure_min_second_half  Pressure_slope_first_half  \
0                       0.0              -4.946711e-07   
1                       0.0               9.104899e-03   
2                       0.0              -7.884998e-04   
3                       0.0               6.132328e-03   
4                       

In [93]:
# remove rows where PumpFailed is NaN
df_cleaned = df_merged.dropna(subset=['PumpFailed']).copy()

# check the cleaned dataframe
# print(df_cleaned.describe())
# print(df_cleaned.info())
print(df_cleaned.dtypes)

# convert PumpFailed to boolean
df_cleaned['PumpFailed'] = df_cleaned['PumpFailed'].astype(int)

# check data types
print(df_cleaned.dtypes)
print(df_cleaned['PumpFailed'].unique())

MachineId                       int16
MeasurementId                   int64
Pressure_mean_first_half      float64
Pressure_mean_second_half     float64
Pressure_min_first_half       float64
Pressure_min_second_half      float64
Pressure_slope_first_half     float64
Pressure_slope_second_half    float64
PumpFailed                     object
SlowStart                      object
SlowEnd                        object
dtype: object
MachineId                       int16
MeasurementId                   int64
Pressure_mean_first_half      float64
Pressure_mean_second_half     float64
Pressure_min_first_half       float64
Pressure_min_second_half      float64
Pressure_slope_first_half     float64
Pressure_slope_second_half    float64
PumpFailed                      int64
SlowStart                      object
SlowEnd                        object
dtype: object
[0 1]


In [94]:
# check for missing values in each column
missing_values = df_cleaned.isna().sum()
print(missing_values)

# drop rows where either SlowStart or SlowEnd have missing values
df_cleaned_no_missing = df_cleaned.dropna(subset=['SlowStart', 'SlowEnd']).copy()

# convert SlowStart and SlowEnd to boolean
df_cleaned_no_missing['SlowStart'] = df_cleaned_no_missing['SlowStart'].astype(int)
df_cleaned_no_missing['SlowEnd'] = df_cleaned_no_missing['SlowEnd'].astype(int)

# check the result
print(df_cleaned_no_missing.isna().sum())
print(df_cleaned_no_missing.info())

MachineId                        0
MeasurementId                    0
Pressure_mean_first_half         0
Pressure_mean_second_half        0
Pressure_min_first_half          0
Pressure_min_second_half         0
Pressure_slope_first_half        0
Pressure_slope_second_half       0
PumpFailed                       0
SlowStart                     7600
SlowEnd                       7600
dtype: int64
MachineId                     0
MeasurementId                 0
Pressure_mean_first_half      0
Pressure_mean_second_half     0
Pressure_min_first_half       0
Pressure_min_second_half      0
Pressure_slope_first_half     0
Pressure_slope_second_half    0
PumpFailed                    0
SlowStart                     0
SlowEnd                       0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 19300 entries, 1 to 27384
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   MachineId     

In [95]:
from sklearn.model_selection import train_test_split

# extracting features and the dependent variable PumpFailed
X = df_cleaned_no_missing.drop('PumpFailed', axis=1)  # features
y = df_cleaned_no_missing['PumpFailed']  # target

# splitting the data into a training set and a testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [96]:
import statsmodels.api as sm

# prepare the data
X = df_cleaned_no_missing.drop(['PumpFailed', 'MachineId', 'MeasurementId'], axis=1)  # features
y = df_cleaned_no_missing['PumpFailed'] # target

# add a constant to the feature set
X_const = sm.add_constant(X)

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_const, y, test_size=0.2, random_state=42)

In [97]:
# logistic regression model
model_sm = sm.Logit(y_train, X_train).fit()

# summary of the model
print(model_sm.summary())


         Current function value: 0.315053
         Iterations: 35


LinAlgError: Singular matrix

In [98]:
print(X.info())

print(X.corr())



<class 'pandas.core.frame.DataFrame'>
Index: 19300 entries, 1 to 27384
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Pressure_mean_first_half    19300 non-null  float64
 1   Pressure_mean_second_half   19300 non-null  float64
 2   Pressure_min_first_half     19300 non-null  float64
 3   Pressure_min_second_half    19300 non-null  float64
 4   Pressure_slope_first_half   19300 non-null  float64
 5   Pressure_slope_second_half  19300 non-null  float64
 6   SlowStart                   19300 non-null  int64  
 7   SlowEnd                     19300 non-null  int64  
dtypes: float64(6), int64(2)
memory usage: 1.3 MB
None
                            Pressure_mean_first_half  \
Pressure_mean_first_half                    1.000000   
Pressure_mean_second_half                   0.042976   
Pressure_min_first_half                          NaN   
Pressure_min_second_half                   -0.02