In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
df = pd.read_csv('/content/Copy of sonar data.csv')

In [3]:
df.sample(5)

Unnamed: 0,0.0200,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.0180,0.0084,0.0090,0.0032,R
23,0.0293,0.0644,0.039,0.0173,0.0476,0.0816,0.0993,0.0315,0.0736,0.086,...,0.0035,0.0052,0.0083,0.0078,0.0075,0.0105,0.016,0.0095,0.0011,R
119,0.0346,0.0509,0.0079,0.0243,0.0432,0.0735,0.0938,0.1134,0.1228,0.1508,...,0.004,0.0122,0.0107,0.0112,0.0102,0.0052,0.0024,0.0079,0.0031,M
161,0.0217,0.0152,0.0346,0.0346,0.0484,0.0526,0.0773,0.0862,0.1451,0.211,...,0.0123,0.0067,0.0011,0.0026,0.0049,0.0029,0.0022,0.0022,0.0032,M
21,0.0099,0.0484,0.0299,0.0297,0.0652,0.1077,0.2363,0.2385,0.0075,0.1882,...,0.0173,0.0149,0.0115,0.0202,0.0139,0.0029,0.016,0.0106,0.0134,R
60,0.0135,0.0045,0.0051,0.0289,0.0561,0.0929,0.1031,0.0883,0.1596,0.1908,...,0.0037,0.0084,0.0102,0.0096,0.0024,0.0037,0.0028,0.003,0.003,R


In [4]:
df.shape

(207, 61)

In [8]:
df.isna().sum()

Unnamed: 0,0
0.0200,0
0.0371,0
0.0428,0
0.0207,0
0.0954,0
...,...
0.0180,0
0.0084,0
0.0090,0
0.0032,0


In [10]:
df.duplicated().sum()

np.int64(0)

In [19]:
df['R'].unique()

array(['R', 'M'], dtype=object)

In [13]:
X_train,X_test,y_train,y_test = train_test_split(df.drop('R',axis=1),df['R'],test_size=0.2,random_state=42)

In [22]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Logistic Regression

In [122]:
scaler = StandardScaler()
pca = PCA(n_components=0.95)
log_model = LogisticRegression()
pipe = Pipeline([
    # ('scaler',scaler),
    # ('pca',pca),
    ('model',log_model)
])
pipe.fit(X_train,y_train)

In [123]:
y_pred = pipe.predict(X_test)
print('accuracy :',accuracy_score(y_test,y_pred))
print('precision :',precision_score(y_test,y_pred))
print('recall :',recall_score(y_test,y_pred))

accuracy : 0.7619047619047619
precision : 0.6086956521739131
recall : 0.9333333333333333


# knn

In [72]:
scaler = StandardScaler()
pca = PCA(n_components=0.95)
knn_model = KNeighborsClassifier()
pipe = Pipeline([
    ('scaler',scaler),
    ('pca',pca),
    ('model',knn_model)
])
pipe.fit(X_train,y_train)

In [73]:
y_pred = pipe.predict(X_test)
print('accuracy :',accuracy_score(y_test,y_pred))
print('precision :',precision_score(y_test,y_pred))
print('recall :',recall_score(y_test,y_pred))

accuracy : 0.7142857142857143
precision : 0.5882352941176471
recall : 0.6666666666666666


# SVC

In [74]:
scaler = StandardScaler()
pca = PCA(n_components=0.95)
svc_model = SVC()
pipe = Pipeline([
    ('scaler',scaler),
    ('pca',pca),
    ('model',svc_model)
])
pipe.fit(X_train,y_train)

In [75]:
y_pred = pipe.predict(X_test)
print('accuracy :',accuracy_score(y_test,y_pred))
print('precision :',precision_score(y_test,y_pred))
print('recall :',recall_score(y_test,y_pred))

accuracy : 0.7619047619047619
precision : 0.631578947368421
recall : 0.8


# randomforest classifier

In [108]:
scaler = StandardScaler()
pca = PCA(n_components=0.95)
rnd_model = RandomForestClassifier(n_estimators=100,max_leaf_nodes=8,random_state=42)
pipe = Pipeline([
    # ('scaler',scaler),
    # ('pca',pca),
    ('model',rnd_model)
])
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print('accuracy :',accuracy_score(y_test,y_pred))
print('precision :',precision_score(y_test,y_pred))
print('recall :',recall_score(y_test,y_pred))
print('f1',f1_score(y_test,y_pred))

# voting classifier

In [110]:
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(
    estimators=[('lr',log_model),('knn',knn_model),('svc',svc_model),('rnd_clf',rnd_model)],
    voting='hard'
)
voting_clf.fit(X_train,y_train)

In [111]:
for clf in (log_model,knn_model,svc_model,rnd_model,voting_clf):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__,accuracy_score(y_test,y_pred))

LogisticRegression 0.7619047619047619
KNeighborsClassifier 0.7380952380952381
SVC 0.7619047619047619
RandomForestClassifier 0.7857142857142857
VotingClassifier 0.7857142857142857


# ada boost

In [112]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),n_estimators=200,algorithm='SAMME',learning_rate=0.5
)
ada_clf.fit(X_train,y_train)



In [113]:
y_pred = ada_clf.predict(X_test)
print('accuracy :',accuracy_score(y_test,y_pred))
print('precision :',precision_score(y_test,y_pred))
print('recall :',recall_score(y_test,y_pred))

accuracy : 0.7380952380952381
precision : 0.5833333333333334
recall : 0.9333333333333333


# gradient boosting

In [114]:
from sklearn.ensemble import GradientBoostingClassifier
gbrt = GradientBoostingClassifier(max_depth=2,n_estimators=3,learning_rate=1)
gbrt.fit(X_train,y_train)

In [115]:
y_pred = gbrt.predict(X_test)
print('accuracy :',accuracy_score(y_test,y_pred))
print('precision :',precision_score(y_test,y_pred))
print('recall :',recall_score(y_test,y_pred))

accuracy : 0.7142857142857143
precision : 0.5652173913043478
recall : 0.8666666666666667


# final model is randomforest

In [131]:
rnd_model = RandomForestClassifier(n_estimators=100,max_leaf_nodes=8,random_state=837)
final_pipe = Pipeline([
    ('model',rnd_model)
])
final_pipe.fit(X_train,y_train)
y_pred = final_pipe.predict(X_test)
print('accuracy :',accuracy_score(y_test,y_pred))
print('precision :',precision_score(y_test,y_pred))
print('recall :',recall_score(y_test,y_pred))
print('f1',f1_score(y_test,y_pred))

accuracy : 0.8809523809523809
precision : 0.8125
recall : 0.8666666666666667
f1 0.8387096774193549


In [132]:
cross_val_score(final_pipe,X_train,y_train,cv=3).mean()

np.float64(0.8242424242424242)

In [125]:
# score = []
# for i in range(1000):
#     rnd_model = RandomForestClassifier(n_estimators=100,max_leaf_nodes=8,random_state=i)
#     rnd_model.fit(X_train,y_train)
#     y_pred = rnd_model.predict(X_test)
#     score.append(accuracy_score(y_test,y_pred))

In [126]:
np.argmax(score)

np.int64(837)

In [133]:
input_data = (0.0307,0.0523,0.0653,0.0521,0.0611,0.0577,0.0665,0.0664,0.1460,0.2792,0.3877,0.4992,0.4981,0.4972,0.5607,0.7339,0.8230,0.9173,0.9975,0.9911,0.8240,0.6498,0.5980,0.4862,0.3150,0.1543,0.0989,0.0284,0.1008,0.2636,0.2694,0.2930,0.2925,0.3998,0.3660,0.3172,0.4609,0.4374,0.1820,0.3376,0.6202,0.4448,0.1863,0.1420,0.0589,0.0576,0.0672,0.0269,0.0245,0.0190,0.0063,0.0321,0.0189,0.0137,0.0277,0.0152,0.0052,0.0121,0.0124,0.0055)

# changing the input_data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the np array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = final_pipe.predict(input_data_reshaped)
print(prediction)

if (prediction[0]=='R'):
  print('The object is a Rock')
else:
  print('The object is a mine')

[0]
The object is a mine


