In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler

%matplotlib inline

In [2]:
# Read the training CSV; cells containing the string "-1" are parsed as NaN.
train_csv_path = r"./Criminal/criminal_train.csv"
train_df = pd.read_csv(train_csv_path, na_values="-1")
train_df.shape

(45718, 72)

In [3]:
# Preview the first two rows to check the column layout.
train_df.head(2)

Unnamed: 0,PERID,IFATHER,NRCH17_2,IRHHSIZ2,IIHHSIZ2,IRKI17_2,IIKI17_2,IRHH65_2,IIHH65_2,PRXRETRY,...,TOOLONG,TROUBUND,PDEN10,COUTYP2,MAIIN102,AIIND102,ANALWT_C,VESTR,VEREP,Criminal
0,25095143,4.0,2.0,4.0,1.0,3.0,1.0,1.0,1.0,99.0,...,1.0,2.0,1.0,1.0,2.0,2.0,3884.805998,40026.0,1.0,0
1,13005143,4.0,1.0,3.0,1.0,2.0,1.0,1.0,1.0,99.0,...,2.0,2.0,2.0,3.0,2.0,2.0,1627.108106,40015.0,2.0,1


In [4]:
# Number of missing values in each column of the raw frame.
train_df.isnull().sum()

PERID          0
IFATHER        2
NRCH17_2      84
IRHHSIZ2       2
IIHHSIZ2       2
IRKI17_2       2
IIKI17_2       2
IRHH65_2       2
IIHH65_2       2
PRXRETRY       2
PRXYDATA       2
MEDICARE       2
CAIDCHIP       2
CHAMPUS        2
PRVHLTIN       2
GRPHLTIN       2
HLTINNOS       2
HLCNOTYR       2
HLCNOTMO       2
HLCLAST        2
HLLOSRSN       2
HLNVCOST       2
HLNVOFFR       2
HLNVREF        2
HLNVNEED       2
HLNVSOR        2
IRMCDCHP       2
IIMCDCHP       2
IRMEDICR       2
IIMEDICR       2
            ... 
CELLNOTCL      2
CELLWRKNG      2
IRFAMSOC       2
IIFAMSOC       2
IRFAMSSI       2
IIFAMSSI       2
IRFSTAMP       2
IIFSTAMP       2
IRFAMPMT       2
IIFAMPMT       2
IRFAMSVC       2
IIFAMSVC       2
IRWELMOS       2
IIWELMOS       2
IRPINC3        2
IRFAMIN3       2
IIPINC3        2
IIFAMIN3       2
GOVTPROG       2
POVERTY3     343
TOOLONG        2
TROUBUND       2
PDEN10         2
COUTYP2        2
MAIIN102       2
AIIND102       2
ANALWT_C       2
VESTR         

In [5]:
# Drop the 2 rows whose IFATHER is missing. Per the original note these rows
# are NaN across the feature columns (PERID itself has no missing values).
train_df = train_df[train_df["IFATHER"].notnull()]

In [6]:
# Separate the target from the features by column name rather than position.
# Positional slicing (first/last column) silently removes two more columns and
# turns a feature into the target if this cell is executed twice; name-based
# selection raises a KeyError in that situation instead of corrupting the data.
y_train = train_df["Criminal"]
# PERID is the record identifier and Criminal is the label; neither is a feature.
train_df = train_df.drop(["PERID", "Criminal"], axis=1)

In [7]:
# Per-column missing-value counts for the remaining feature columns.
train_df.isnull().sum()

IFATHER        0
NRCH17_2      82
IRHHSIZ2       0
IIHHSIZ2       0
IRKI17_2       0
IIKI17_2       0
IRHH65_2       0
IIHH65_2       0
PRXRETRY       0
PRXYDATA       0
MEDICARE       0
CAIDCHIP       0
CHAMPUS        0
PRVHLTIN       0
GRPHLTIN       0
HLTINNOS       0
HLCNOTYR       0
HLCNOTMO       0
HLCLAST        0
HLLOSRSN       0
HLNVCOST       0
HLNVOFFR       0
HLNVREF        0
HLNVNEED       0
HLNVSOR        0
IRMCDCHP       0
IIMCDCHP       0
IRMEDICR       0
IIMEDICR       0
IRCHMPUS       0
            ... 
OTHINS         0
CELLNOTCL      0
CELLWRKNG      0
IRFAMSOC       0
IIFAMSOC       0
IRFAMSSI       0
IIFAMSSI       0
IRFSTAMP       0
IIFSTAMP       0
IRFAMPMT       0
IIFAMPMT       0
IRFAMSVC       0
IIFAMSVC       0
IRWELMOS       0
IIWELMOS       0
IRPINC3        0
IRFAMIN3       0
IIPINC3        0
IIFAMIN3       0
GOVTPROG       0
POVERTY3     341
TOOLONG        0
TROUBUND       0
PDEN10         0
COUTYP2        0
MAIIN102       0
AIIND102       0
ANALWT_C      

In [8]:
# NRCH17_2 and POVERTY3 still contain NaNs; show the rows where NRCH17_2 is missing.
nrch_missing = train_df["NRCH17_2"].isnull()
train_df.loc[nrch_missing]

Unnamed: 0,IFATHER,NRCH17_2,IRHHSIZ2,IIHHSIZ2,IRKI17_2,IIKI17_2,IRHH65_2,IIHH65_2,PRXRETRY,PRXYDATA,...,POVERTY3,TOOLONG,TROUBUND,PDEN10,COUTYP2,MAIIN102,AIIND102,ANALWT_C,VESTR,VEREP
367,4.0,,3.0,1.0,2.0,3.0,1.0,2.0,98.0,98.0,...,3.0,2.0,2.0,2.0,2.0,2.0,2.0,10993.170990,40007.0,1.0
932,4.0,,3.0,3.0,2.0,3.0,1.0,3.0,98.0,98.0,...,2.0,1.0,1.0,1.0,1.0,2.0,2.0,6655.489170,40049.0,2.0
996,4.0,,1.0,3.0,1.0,3.0,1.0,2.0,98.0,98.0,...,1.0,1.0,2.0,1.0,1.0,2.0,2.0,2361.175166,40004.0,1.0
1351,4.0,,6.0,3.0,4.0,3.0,1.0,3.0,98.0,98.0,...,3.0,2.0,1.0,2.0,2.0,2.0,2.0,2651.961762,40036.0,1.0
4234,1.0,,6.0,1.0,4.0,3.0,2.0,3.0,99.0,99.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,903.538243,40025.0,1.0
5097,4.0,,4.0,1.0,1.0,3.0,1.0,3.0,98.0,98.0,...,2.0,1.0,2.0,1.0,1.0,2.0,2.0,6158.186864,40001.0,1.0
5165,4.0,,5.0,1.0,3.0,3.0,1.0,3.0,99.0,99.0,...,1.0,2.0,2.0,2.0,3.0,2.0,2.0,2776.520224,40022.0,2.0
6140,4.0,,3.0,3.0,2.0,3.0,1.0,3.0,98.0,98.0,...,1.0,2.0,2.0,2.0,3.0,2.0,2.0,8757.258365,40020.0,2.0
6297,4.0,,2.0,3.0,1.0,3.0,2.0,3.0,98.0,98.0,...,3.0,98.0,98.0,2.0,1.0,2.0,2.0,18919.282520,40014.0,2.0
6416,4.0,,3.0,1.0,2.0,1.0,1.0,1.0,99.0,99.0,...,3.0,1.0,2.0,2.0,2.0,2.0,2.0,7579.858039,40011.0,2.0


In [9]:
# Fill the missing values in NRCH17_2 and POVERTY3 with each column's mean.
# The columns are cast to int afterwards, so the mean is rounded to the nearest
# whole number first; casting the raw mean straight to int would truncate it
# toward zero and bias the imputed value downward.
impute_cols = ["NRCH17_2", "POVERTY3"]
# Kept in a variable so the same fill values can be reused on other data.
impute_values = train_df[impute_cols].mean().round()
train_df[impute_cols] = train_df[impute_cols].fillna(impute_values).astype(int)

In [10]:
# Largest per-column missing count; 0 means no NaNs remain in the frame.
train_df.isnull().sum().max()

0

In [11]:
# The five columns with the highest number of distinct values.
distinct_counts = train_df.nunique()
distinct_counts.sort_values(ascending=False).head(5)

ANALWT_C    45646
VESTR          50
HLCNOTMO       17
HLLOSRSN       17
IRWELMOS       13
dtype: int64

In [12]:
# Column groups that drive the preprocessing cells of this notebook.

# Unordered categorical columns; these are one-hot encoded with pd.get_dummies.
nominal_cats = ['IFATHER', 'HLLOSRSN', 'HLNVCOST', 'HLNVOFFR', 'HLNVREF', 'HLNVNEED', 'HLNVSOR', 'PRXRETRY', 'PRXYDATA', 'MEDICARE', 
                'CAIDCHIP', 'CHAMPUS', 'PRVHLTIN', 'GRPHLTIN', 'HLTINNOS', 'HLCNOTYR', 'IRMCDCHP', 'IRMEDICR', 'IRCHMPUS', 
                'IRPRVHLT', 'IROTHHLT', 'HLCALLFG', 'HLCALL99', 'ANYHLTI2', 'IRINSUR4', 'OTHINS', 'CELLNOTCL', 'CELLWRKNG', 
                'IRFAMSOC', 'IRFAMSSI', 'IRFSTAMP', 'IRFAMPMT', 'IRFAMSVC', 'GOVTPROG', 'TOOLONG', 'TROUBUND', 'COUTYP2',
                'VESTR', 'VEREP']
# Columns treated as ordered categories; these are cast to an ordered categorical dtype.
ordinal_cats = ['IIHHSIZ2', 'IIKI17_2', 'IIHH65_2', 'IIMCDCHP', 'IIMEDICR', 'IICHMPUS', 'IIPRVHLT', 'IIOTHHLT', 
                'IIINSUR4', 'IIFAMSOC', 'IIFAMSSI', 
                'IIFSTAMP', 'IIFAMPMT', 'IIFAMSVC', 'IIWELMOS', 'IRPINC3', 'IRFAMIN3', 'IIPINC3', 'IIFAMIN3', 
                'POVERTY3', 'PDEN10', 'MAIIN102', 'AIIND102']
# Columns treated as numeric; these are standardised with StandardScaler.
continuous = ["ANALWT_C", 'NRCH17_2', 'IRHHSIZ2', 'IRKI17_2', 'IRHH65_2', 'HLCNOTMO', 'HLCLAST', 'IRWELMOS']

In [13]:
# One-hot encode the nominal columns. drop_first=True omits the first level of
# each column, so k levels produce k-1 dummy columns.
# NOTE(review): any other dataset scored with this model needs the same dummy
# columns; confirm how unseen data is aligned.
train_df = pd.get_dummies(train_df, columns=nominal_cats, drop_first=True)

In [14]:
# Cast every ordinal column to one shared ordered categorical dtype with the
# levels 0..19. The dtype is built explicitly because passing `ordered=` and
# `categories=` straight to Series.astype is no longer accepted by pandas.
# Caution: under pandas' categorical semantics a value outside 0..19 becomes NaN.
ordinal_dtype = pd.api.types.CategoricalDtype(categories=list(range(0, 20)), ordered=True)
for cat in ordinal_cats:
    train_df[cat] = train_df[cat].astype(ordinal_dtype)

  


In [15]:
# Make HLCNOTMO and HLCLAST usable as continuous features: every value of 50
# or more is reset to 0 (the original note describes these as the 84-99 codes).
columns = ["HLCNOTMO", "HLCLAST"]
for name in columns:
    is_special_code = train_df[name] >= 50
    train_df.loc[is_special_code, name] = 0

In [16]:
# Summarise column count, dtypes and memory use of the encoded frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45716 entries, 0 to 45717
Columns: 207 entries, NRCH17_2 to VEREP_2.0
dtypes: category(23), float64(8), uint8(176)
memory usage: 11.8 MB


In [17]:
# Standardise the continuous columns (StandardScaler works column by column, so
# fitting all columns together gives the same values as fitting one at a time).
# A single fitted scaler is kept in `scaler` so the same parameters can later be
# applied to other data with scaler.transform(); fitting a fresh scaler per
# column and discarding it would lose them.
scaler = StandardScaler()
train_df[continuous] = scaler.fit_transform(train_df[continuous])

In [19]:
# SVM: fit a support vector classifier with scikit-learn's default
# hyperparameters on the full preprocessed training frame.
from sklearn.svm import SVC
classifier = SVC()
classifier.fit(train_df, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [20]:
# Predict on the same rows the classifier was fitted on, so every score computed
# from y_pred is a training-set score, not a held-out estimate.
y_pred = classifier.predict(train_df)

In [27]:
# Fraction of training rows whose predicted label matches the true label.
from sklearn.metrics import confusion_matrix, accuracy_score, matthews_corrcoef
accuracy_score(y_train, y_pred)

0.9472176043398373

In [28]:
# Confusion matrix on the training data: rows are true classes, columns are
# predicted classes (scikit-learn convention).
confusion_matrix(y_train, y_pred)

array([[41985,   556],
       [ 1857,  1318]], dtype=int64)

In [29]:
# Matthews correlation coefficient between the training labels and predictions.
matthews_corrcoef(y_train, y_pred)

0.5154945060282949

In [None]:
# Training-set Matthews correlation coefficient from the SVC run above: 0.5154945060282949
