In [24]:
import pyarrow
import pandas as pd

def read_input(path):
    return pd.read_feather(path)


import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import _VectorizerMixin
from sklearn.feature_selection._base import SelectorMixin
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import CountVectorizer


def get_feature_out(estimator, feature_in):
    if hasattr(estimator,'get_feature_names'):
        if isinstance(estimator, _VectorizerMixin):
            return [f'vec_{f}' \
                for f in estimator.get_feature_names()]
        else:
            return estimator.get_feature_names(feature_in)
    elif isinstance(estimator, SelectorMixin):
        return np.array(feature_in)[estimator.get_support()]
    else:
        return feature_in

def get_ct_feature_names(ct):

    output_features = []

    for name, estimator, features in ct.transformers_:
        if name!='remainder':
            if isinstance(estimator, Pipeline):
                current_features = features
                for step in estimator:
                    current_features = get_feature_out(step, current_features)
                features_out = current_features
            else:
                features_out = get_feature_out(estimator, features)
            output_features.extend(features_out)
        elif estimator=='passthrough':
            output_features.extend(ct._feature_names_in[features])
                
    return output_features

ic_summary = read_input('/Users/vnk/Downloads/Algorithms_Challenge/interconnect_roi_summary.feather')
ic_time_Series = read_input('/Users/vnk/Downloads/Algorithms_Challenge/interconnect_roi_timeseries.feather')

In [28]:
 ic_summary

Unnamed: 0,filename,roi_id,cell_group,roi_side,polarity,delta,result,failure_type
0,Rec-000013.ats,1-A-left,1-A,left,pos,355.550000,pass,
1,Rec-000013.ats,1-A-center,1-A,center,neg,452.953125,pass,
2,Rec-000013.ats,1-A-right,1-A,right,pos,520.587500,pass,
3,Rec-000013.ats,1-B-left,1-B,left,pos,1105.310000,pass,
4,Rec-000013.ats,1-B-center,1-B,center,neg,378.309375,pass,
...,...,...,...,...,...,...,...,...
5827,SM05326_LOW_WELD_TEST1 L2.ats,18-K-center,18-K,center,neg,125.312500,pass,
5828,SM05326_LOW_WELD_TEST1 L2.ats,18-K-right,18-K,right,pos,123.255000,pass,
5829,SM05326_LOW_WELD_TEST1 L2.ats,18-L-left,18-L,left,pos,162.072500,pass,
5830,SM05326_LOW_WELD_TEST1 L2.ats,18-L-center,18-L,center,neg,162.143750,pass,


In [29]:
ic_summary['failure_type'].isna().sum()

5808

In [30]:
ic_summary['failure_type'].value_counts()

hot     17
cold     7
Name: failure_type, dtype: int64

In [31]:
ic_time_Series

Unnamed: 0,filename,frame_number,roi_id,cell_group,roi_side,polarity,mean_pixel_value
0,Rec-000013.ats,1,1-A-left,1-A,left,pos,7695.830000
1,Rec-000013.ats,1,1-A-center,1-A,center,neg,7649.421875
2,Rec-000013.ats,1,1-A-right,1-A,right,pos,7610.145000
3,Rec-000013.ats,1,1-B-left,1-B,left,pos,7661.817500
4,Rec-000013.ats,1,1-B-center,1-B,center,neg,7616.990625
...,...,...,...,...,...,...,...
1122331,SM05326_LOW_WELD_TEST1 L2.ats,111,18-K-center,18-K,center,neg,4863.853125
1122332,SM05326_LOW_WELD_TEST1 L2.ats,111,18-K-right,18-K,right,pos,4895.822500
1122333,SM05326_LOW_WELD_TEST1 L2.ats,111,18-L-left,18-L,left,pos,4973.267500
1122334,SM05326_LOW_WELD_TEST1 L2.ats,111,18-L-center,18-L,center,neg,5047.671875


In [32]:
ic_time_Series[(ic_time_Series['filename']=='Rec-000013.ats') & (ic_time_Series['roi_id']=='1-A-left')]

Unnamed: 0,filename,frame_number,roi_id,cell_group,roi_side,polarity,mean_pixel_value
0,Rec-000013.ats,1,1-A-left,1-A,left,pos,7695.8300
648,Rec-000013.ats,2,1-A-left,1-A,left,pos,7695.9050
1296,Rec-000013.ats,3,1-A-left,1-A,left,pos,7695.1800
1944,Rec-000013.ats,4,1-A-left,1-A,left,pos,7695.2675
2592,Rec-000013.ats,5,1-A-left,1-A,left,pos,7695.1000
...,...,...,...,...,...,...,...
169128,Rec-000013.ats,262,1-A-left,1-A,left,pos,8044.1300
169776,Rec-000013.ats,263,1-A-left,1-A,left,pos,8045.8250
170424,Rec-000013.ats,264,1-A-left,1-A,left,pos,8046.2025
171072,Rec-000013.ats,265,1-A-left,1-A,left,pos,8047.3050


In [33]:
ic_pixel_stats = ic_time_Series.groupby(['filename','roi_id'])['mean_pixel_value'].agg(['mean', 'min', 'max']).reset_index()
ic_pixel_stats

Unnamed: 0,filename,roi_id,mean,min,max
0,Rec-000008.ats,1-A-center,7933.512168,7723.878125,8257.512500
1,Rec-000008.ats,1-A-left,7968.288913,7768.455000,8290.990000
2,Rec-000008.ats,1-A-right,7933.090085,7678.012500,8325.817500
3,Rec-000008.ats,1-B-center,7974.438904,7710.450000,8393.971875
4,Rec-000008.ats,1-B-left,8269.168768,7737.327500,9180.202500
...,...,...,...,...,...
5827,SM05334_ HIGH_WELD_TEST2 L2.ats,9-K-left,5117.541448,5042.880000,5185.680000
5828,SM05334_ HIGH_WELD_TEST2 L2.ats,9-K-right,5159.873571,5081.945000,5227.600000
5829,SM05334_ HIGH_WELD_TEST2 L2.ats,9-L-center,5569.356672,5517.350000,5618.493750
5830,SM05334_ HIGH_WELD_TEST2 L2.ats,9-L-left,5396.543472,5321.217500,5464.030000


In [34]:
ic = ic_summary.merge(ic_pixel_stats, on = ['filename','roi_id'], how = 'inner')
ic['failure_type'] = ic['failure_type'].fillna('just_right')
ic[['cell_group_column','cell_group_row']] = ic.cell_group.str.split('-',expand=True)

In [35]:
features = ic[['cell_group_column','cell_group_row','roi_side','polarity','delta','mean','min','max']]

In [36]:
features.dtypes

cell_group_column     object
cell_group_row        object
roi_side              object
polarity              object
delta                float64
mean                 float64
min                  float64
max                  float64
dtype: object

In [37]:
Y = ic['failure_type']
Y.value_counts()

just_right    5808
hot             17
cold             7
Name: failure_type, dtype: int64

In [38]:
import numpy as np
from sklearn.pipeline import Pipeline


In [39]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features, Y, test_size=0.20, random_state=42)



In [40]:
y_train.shape

(4665,)

In [41]:
numeric_features = x_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = x_train.select_dtypes(exclude=[np.number]).columns.tolist()

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(drop='first'))])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder="drop")


In [42]:


preprocessor_fit = preprocessor.fit(x_train)
data = preprocessor_fit.transform(x_train)

In [43]:
test_data = preprocessor_fit.transform(x_test)

In [44]:
test_data.shape

(1167, 35)

In [57]:
from sklearn.linear_model import LogisticRegression
class_weight={'cold':0.4,'hot':0.5, 'just_right':0.1}

model = LogisticRegression(multi_class='multinomial', solver='lbfgs', class_weight = 'balanced')
model.fit(data,y_train)

LogisticRegression(class_weight='balanced', multi_class='multinomial')

In [66]:
from sklearn.model_selection import GridSearchCV
class_weight1={'cold':0.4,'hot':0.5, 'just_right':0.1}
class_weight2={'cold':0.4,'hot':0.4, 'just_right':0.2}


parameters = {'class_weight':(class_weight1, class_weight2)}
model = LogisticRegression(multi_class='multinomial')

model = GridSearchCV(model, parameters, cv =5)
model.fit(data,y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(multi_class='multinomial'),
             param_grid={'class_weight': ({'cold': 0.4, 'hot': 0.5,
                                           'just_right': 0.1},
                                          {'cold': 0.4, 'hot': 0.4,
                                           'just_right': 0.2})})

In [70]:
model.cv_results_

{'mean_fit_time': array([0.05317578, 0.04578509]),
 'std_fit_time': array([0.0121176 , 0.00217317]),
 'mean_score_time': array([0.00079141, 0.0006916 ]),
 'std_score_time': array([1.27584160e-04, 2.72053882e-05]),
 'param_class_weight': masked_array(data=[{'cold': 0.4, 'hot': 0.5, 'just_right': 0.1},
                    {'cold': 0.4, 'hot': 0.4, 'just_right': 0.2}],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'class_weight': {'cold': 0.4, 'hot': 0.5, 'just_right': 0.1}},
  {'class_weight': {'cold': 0.4, 'hot': 0.4, 'just_right': 0.2}}],
 'split0_test_score': array([0.99571275, 0.99571275]),
 'split1_test_score': array([0.99678457, 0.99678457]),
 'split2_test_score': array([0.99571275, 0.99571275]),
 'split3_test_score': array([0.99678457, 0.99678457]),
 'split4_test_score': array([0.98928189, 0.99464094]),
 'mean_test_score': array([0.99485531, 0.99592712]),
 'std_test_score': array([0.00282763, 0.00080207]),
 'rank_test_score': a

In [51]:
y_predict = model.predict(test_data)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

        cold       0.00      0.00      0.00         1
         hot       0.00      0.00      0.00         2
  just_right       1.00      1.00      1.00      1164

    accuracy                           1.00      1167
   macro avg       0.33      0.33      0.33      1167
weighted avg       0.99      1.00      1.00      1167



  _warn_prf(average, modifier, msg_start, len(result))


In [37]:
ic['cell_group'].nunique()

216

In [11]:
ic_time_Series.filename.unique()

array(['Rec-000013.ats', 'Rec-000012.ats', 'Rec-000010.ats',
       'Rec-000011.ats', 'Rec-000008.ats', 'Rec-000009.ats',
       'SM05334_ HIGH_WELD_TEST2 L2.ats',
       'SM05330_ HIGH_WELD_TEST2 L2.ats', 'SM05326_LOW_WELD_TEST1 L2.ats'],
      dtype=object)

In [4]:
ic_summary.failure_type.value_counts()

hot     17
cold     7
Name: failure_type, dtype: int64

In [5]:
ic_summary[ic_summary['result']=='fail']

Unnamed: 0,filename,roi_id,cell_group,roi_side,polarity,delta,result,failure_type
163,Rec-000013.ats,5-G-center,5-G,center,neg,382.928125,fail,hot
200,Rec-000013.ats,6-G-right,6-G,right,pos,31.6225,fail,cold
226,Rec-000013.ats,7-D-center,7-D,center,neg,493.690625,fail,hot
3888,SM05334_ HIGH_WELD_TEST2 L2.ats,1-A-left,1-A,left,pos,40.3275,fail,cold
3889,SM05334_ HIGH_WELD_TEST2 L2.ats,1-A-center,1-A,center,neg,304.4625,fail,hot
3892,SM05334_ HIGH_WELD_TEST2 L2.ats,1-B-center,1-B,center,neg,431.909375,fail,hot
4536,SM05330_ HIGH_WELD_TEST2 L2.ats,1-A-left,1-A,left,pos,30.32,fail,cold
4540,SM05330_ HIGH_WELD_TEST2 L2.ats,1-B-center,1-B,center,neg,1160.465625,fail,hot
4541,SM05330_ HIGH_WELD_TEST2 L2.ats,1-B-right,1-B,right,pos,37.49,fail,cold
4543,SM05330_ HIGH_WELD_TEST2 L2.ats,1-C-center,1-C,center,neg,378.728125,fail,hot


In [8]:
ic_summary['failure_type'].isna().sum()

5808

In [12]:
ic_summary

Unnamed: 0,filename,roi_id,cell_group,roi_side,polarity,delta,result,failure_type
0,Rec-000013.ats,1-A-left,1-A,left,pos,355.550000,pass,
1,Rec-000013.ats,1-A-center,1-A,center,neg,452.953125,pass,
2,Rec-000013.ats,1-A-right,1-A,right,pos,520.587500,pass,
3,Rec-000013.ats,1-B-left,1-B,left,pos,1105.310000,pass,
4,Rec-000013.ats,1-B-center,1-B,center,neg,378.309375,pass,
...,...,...,...,...,...,...,...,...
5827,SM05326_LOW_WELD_TEST1 L2.ats,18-K-center,18-K,center,neg,125.312500,pass,
5828,SM05326_LOW_WELD_TEST1 L2.ats,18-K-right,18-K,right,pos,123.255000,pass,
5829,SM05326_LOW_WELD_TEST1 L2.ats,18-L-left,18-L,left,pos,162.072500,pass,
5830,SM05326_LOW_WELD_TEST1 L2.ats,18-L-center,18-L,center,neg,162.143750,pass,
