In [52]:
#importing the usual libraries
import numpy as np
import pandas as pd
#visualization
import matplotlib.pyplot as plt
import seaborn as sns
#train_test_split
from sklearn.model_selection import train_test_split
#For scaling the dataset
from sklearn.preprocessing import StandardScaler
#importing logistic regression 
from sklearn.linear_model import LogisticRegression
#For checking the accuracy
from sklearn.metrics import confusion_matrix,classification_report

# Loading the Dataset

In [53]:
#reading the inspection results CSV from the Kaggle input directory into a DataFrame
df=pd.read_csv('/kaggle/input/nyc-inspections/DOHMH_New_York_City_Restaurant_Inspection_Results.csv')
#showing the dataset (the bare last expression is rendered as the cell output)
df

Unnamed: 0,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,PHONE,CUISINE DESCRIPTION,INSPECTION DATE,ACTION,VIOLATION CODE,VIOLATION DESCRIPTION,CRITICAL FLAG,SCORE,GRADE,GRADE DATE,RECORD DATE,INSPECTION TYPE
0,40511702,NOTARO RESTAURANT,MANHATTAN,635,SECOND AVENUE,10016.0,2126863400,Italian,06/15/2015,Violations were cited in the following area(s).,02B,Hot food item not held at or above 140Âº F.,Critical,30.0,,,08/28/2017,Cycle Inspection / Initial Inspection
1,40511702,NOTARO RESTAURANT,MANHATTAN,635,SECOND AVENUE,10016.0,2126863400,Italian,11/25/2014,Violations were cited in the following area(s).,20F,Current letter grade card not posted.,Not Critical,,,,08/28/2017,Administrative Miscellaneous / Initial Inspection
2,50046354,VITE BAR,QUEENS,2507,BROADWAY,11106.0,3478134702,Italian,10/03/2016,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructe...,Not Critical,2.0,,,08/28/2017,Pre-permit (Operational) / Initial Inspection
3,50061389,TACK'S CHINESE TAKE OUT,STATEN ISLAND,11C,HOLDEN BLVD,10314.0,7189839854,Chinese,05/17/2017,Violations were cited in the following area(s).,02G,Cold food item held above 41Âº F (smoked fish ...,Critical,46.0,,,08/28/2017,Pre-permit (Operational) / Initial Inspection
4,41516263,NO QUARTER,BROOKLYN,8015,5 AVENUE,11209.0,7187019180,American,03/30/2017,Violations were cited in the following area(s).,04M,Live roaches present in facility's food and/or...,Critical,18.0,,,08/28/2017,Cycle Inspection / Initial Inspection
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399913,41004284,CAFE CLASSICO,MANHATTAN,35,WEST 57 STREET,10019.0,2123555411,Jewish/Kosher,01/28/2016,Violations were cited in the following area(s).,04C,Food worker does not use proper utensil to eli...,Critical,19.0,,,08/28/2017,Cycle Inspection / Initial Inspection
399914,40368318,MAGGIES PLACE,MANHATTAN,21,EAST 47 STREET,10017.0,2127535757,American,02/29/2016,Violations were cited in the following area(s).,04L,Evidence of mice or live mice present in facil...,Critical,11.0,A,02/29/2016,08/28/2017,Cycle Inspection / Re-inspection
399915,50051468,Yong Kong Street,MANHATTAN,1000S,8TH AVE,10019.0,9176406083,Japanese,01/05/2017,Violations were cited in the following area(s).,10B,Plumbing not properly installed or maintained;...,Not Critical,25.0,,,08/28/2017,Pre-permit (Operational) / Initial Inspection
399916,41555297,RUBY FALLS NIGHTLIFE CAFE,STATEN ISLAND,45,PAGE AVENUE,10309.0,7189849888,American,09/19/2014,Violations were cited in the following area(s).,10I,"Single service item reused, improperly stored,...",Not Critical,12.0,A,09/19/2014,08/28/2017,Cycle Inspection / Re-inspection


# Checking the Preliminary Information

In [54]:
#printing the column names, non-null counts and dtypes of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399918 entries, 0 to 399917
Data columns (total 18 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   CAMIS                  399918 non-null  int64  
 1   DBA                    399559 non-null  object 
 2   BORO                   399918 non-null  object 
 3   BUILDING               399809 non-null  object 
 4   STREET                 399909 non-null  object 
 5   ZIPCODE                399909 non-null  float64
 6   PHONE                  399913 non-null  object 
 7   CUISINE DESCRIPTION    399918 non-null  object 
 8   INSPECTION DATE        399918 non-null  object 
 9   ACTION                 398783 non-null  object 
 10  VIOLATION CODE         393414 non-null  object 
 11  VIOLATION DESCRIPTION  392939 non-null  object 
 12  CRITICAL FLAG          399918 non-null  object 
 13  SCORE                  376704 non-null  float64
 14  GRADE                  195413 non-nu

# Checking for Missing Values

In [55]:
#counting the missing values in every column of the raw frame
df.isna().sum()

CAMIS                         0
DBA                         359
BORO                          0
BUILDING                    109
STREET                        9
ZIPCODE                       9
PHONE                         5
CUISINE DESCRIPTION           0
INSPECTION DATE               0
ACTION                     1135
VIOLATION CODE             6504
VIOLATION DESCRIPTION      6979
CRITICAL FLAG                 0
SCORE                     23214
GRADE                    204505
GRADE DATE               207098
RECORD DATE                   0
INSPECTION TYPE            1135
dtype: int64

In [68]:
def onehot_encode(df,columns):
    """Return a copy of ``df`` with each listed column replaced by indicator columns.

    For every name in ``columns`` (processed in the given order) the indicator
    columns produced by ``pd.get_dummies`` are appended on the right of the
    frame and the source column is then removed. The indicator columns are
    named after the category values only (no column-name prefix). The frame
    passed in is not modified.
    """
    encoded=df.copy()
    for col_name in columns:
        indicators=pd.get_dummies(encoded[col_name])
        #append the indicators first, then remove the source column by label
        encoded=pd.concat([encoded,indicators],axis=1).drop(columns=col_name)
    return encoded

# Creating the Preprocessing Function

In [83]:
def preprocess_inputs(df,random_state=42):
    """Clean the raw inspection frame and return scaled train/test splits.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw inspection results. Must contain the columns referenced below
        ('CAMIS', 'DBA', 'VIOLATION DESCRIPTION', 'RECORD DATE', 'ZIPCODE',
        'CRITICAL FLAG', 'SCORE', 'INSPECTION DATE' and the one-hot columns).
        The frame passed in is not modified.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` so that the split is
        reproducible. Pass ``None`` to get a different split on every call.

    Returns
    -------
    x_train, x_test : pandas.DataFrame
        Standardised features (75% / 25% of the rows). The scaler and the
        SCORE imputation value are fitted on the training rows only.
    y_train, y_test : pandas.Series
        Binary target: 0 for 'Not Critical', 1 otherwise.
    """
    df=df.copy()
    #Dropping the Camis and Dba columns, the Violation Description column
    #and the Record Date column (single value)
    df=df.drop(['CAMIS','DBA','VIOLATION DESCRIPTION','RECORD DATE'],axis=1)
    #Dropping columns where at least 25% of the values are missing
    high_missing=df.columns[df.isna().mean()>=0.25]
    df=df.drop(high_missing,axis=1)
    #Dropping the high_cardinality columns (ignored if already dropped above)
    high_cardinality=['BUILDING','STREET','PHONE']
    df=df.drop(high_cardinality,axis=1,errors='ignore')
    #Converting ZIPCODE to a string column so it is treated as categorical
    #(missing zip codes become the literal category 'nan')
    df['ZIPCODE']=df['ZIPCODE'].astype(str)
    #Dropping rows whose target is 'Not Applicable'
    df=df[df['CRITICAL FLAG']!='Not Applicable'].reset_index(drop=True)
    #Extracting month and year from INSPECTION DATE. The dates are MM/DD/YYYY,
    #so the month is the first field (characters 3:5 are the day of the month).
    inspection_date=pd.to_datetime(df['INSPECTION DATE'],format='%m/%d/%Y')
    df['INSPECTION MONTH']=inspection_date.dt.month
    df['INSPECTION YEAR']=inspection_date.dt.year
    df=df.drop('INSPECTION DATE',axis=1)

    #Splitting the feature columns and the target column;
    #the label is 0 for 'Not Critical' and 1 for everything else
    y=(df['CRITICAL FLAG']!='Not Critical').astype(int)
    x=df.drop('CRITICAL FLAG',axis=1)
    #onehot_encoding the remaining categorical columns
    #NOTE(review): the notebook reports a test accuracy of 1.0, which suggests
    #that a feature (possibly VIOLATION CODE) determines the target - confirm
    #whether it should be kept as a predictor.
    onehot_columns=['BORO', 'ZIPCODE', 'CUISINE DESCRIPTION', 'ACTION', 'VIOLATION CODE','INSPECTION TYPE']
    x=onehot_encode(x,onehot_columns)

    #train_test_split (seeded for reproducibility)
    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.75,random_state=random_state)
    #Filling missing SCORE values with the mean of the training rows only,
    #so that no statistic of the test rows leaks into the features
    score_mean=x_train['SCORE'].mean()
    x_train=x_train.fillna({'SCORE':score_mean})
    x_test=x_test.fillna({'SCORE':score_mean})
    #Scaling with StandardScaler fitted on the training rows only; the row
    #index is kept so that features stay aligned with y_train / y_test
    scaler=StandardScaler()
    x_train=pd.DataFrame(scaler.fit_transform(x_train),index=x_train.index,columns=x_train.columns)
    x_test=pd.DataFrame(scaler.transform(x_test),index=x_test.index,columns=x_test.columns)
    return x_train,x_test,y_train,y_test

In [74]:
#NOTE(review): `x` is not assigned at notebook level in any visible cell (it is
#only a local variable inside preprocess_inputs), so this cell appears to rely
#on leftover kernel state and may fail after Restart & Run All - confirm.
#listing the columns of `x` that have object dtype
x.select_dtypes('object').columns

Index(['BORO', 'ZIPCODE', 'CUISINE DESCRIPTION', 'ACTION', 'VIOLATION CODE',
       'CRITICAL FLAG', 'INSPECTION TYPE'],
      dtype='object')

In [84]:
#running the preprocessing function on the raw frame to get the four splits
x_train,x_test,y_train,y_test=preprocess_inputs(df)
#printing the shape of each split
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


(294704, 454)
(98235, 454)
(294704,)
(98235,)


# Training the Model

In [86]:
#creating a logistic regression classifier with its default settings
model=LogisticRegression()
#fitting the classifier on the training split
model.fit(x_train,y_train)


LogisticRegression()

# Getting the Predictions for the Confusion Matrix

In [100]:
#converting the test labels to a NumPy array
y_test=np.array(y_test)

In [98]:
#predicting the class labels of the test features with the fitted model
#(the original line called the undefined name `npmodel`, which raises NameError)
y_pred=model.predict(x_test)

In [99]:
#showing the predicted labels
y_pred

array([1, 1, 1, ..., 1, 1, 1])

# Getting the Score of the Trained Model

In [87]:
#printing the mean accuracy of the fitted model on the test split
#NOTE(review): the recorded output is 1.0; a perfect score may indicate that a
#feature leaks the target - verify before trusting this number.
print("Model Accuracy:",model.score(x_test,y_test))

Model Accuracy: 1.0


# Checking the Number of Unique Values in Each Object Column

In [58]:
#NOTE(review): `x` is not assigned at notebook level in any visible cell - this
#cell appears to depend on leftover kernel state; confirm where `x` is defined.
#mapping each object-dtype column of `x` to its number of distinct values
{column:len(list(x[column].unique())) for column in x.columns if x[column].dtypes=='object'}

{'BORO': 5,
 'ZIPCODE': 230,
 'CUISINE DESCRIPTION': 84,
 'INSPECTION DATE': 1405,
 'ACTION': 5,
 'VIOLATION CODE': 93,
 'CRITICAL FLAG': 2,
 'INSPECTION TYPE': 34}

In [59]:
#NOTE(review): `x` is not assigned at notebook level in any visible cell - confirm.
#showing the distinct values of the CRITICAL FLAG column of `x`
x['CRITICAL FLAG'].unique()

array(['Critical', 'Not Critical'], dtype=object)

In [60]:
#getting the index labels of the rows whose CRITICAL FLAG is 'Not Applicable'
df[df['CRITICAL FLAG']=='Not Applicable'].index

Int64Index([   147,    164,    214,    215,    227,    237,    269,    303,
               325,    429,
            ...
            399279, 399280, 399286, 399392, 399439, 399484, 399539, 399542,
            399836, 399864],
           dtype='int64', length=6979)

In [61]:
#NOTE(review): `x` is not assigned at notebook level in any visible cell - confirm.
#listing the columns of `x` in which at least 25% of the values are missing
x.columns[x.isna().mean()>=0.25]

Index([], dtype='object')