In [None]:
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split 
import pandas as pd 
from pandas.plotting import scatter_matrix
import numpy as np
%matplotlib inline

In [None]:
# Load the abalone CSV into a DataFrame and preview its first rows.
csv_path = '../input/abalone-dataset/abalone.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Preview the first five rows (same view as the preview shown right after loading).
df.head()

In [None]:
# Summarise the DataFrame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Count how many rows fall into each category of the 'Sex' column.
df['Sex'].value_counts()

In [None]:
# One histogram per numeric column, to eyeball each distribution.
hist_options = {'bins': 50, 'figsize': (20, 15)}
df.hist(**hist_options)
plt.show()

In [None]:
# Pearson correlation coefficient (r) between every pair of numeric columns.
# 'Sex' still holds text labels at this point, so restrict the frame to its
# numeric columns first: pandas >= 2.0 raises a ValueError on non-numeric
# columns instead of silently dropping them.
corr_matrix = df.select_dtypes(include='number').corr()
corr_matrix['Rings']

In [None]:
# Correlation of each column in corr_matrix with 'Whole weight'.
corr_matrix['Whole weight']

In [None]:
# Pairwise scatter plots of the size measurements and the ring count,
# to inspect how the variables relate to each other.
attributes = ['Length', 'Diameter', 'Height', 'Rings']
scatter_matrix(df[attributes], figsize=(12, 8))
plt.show()  # render the figure instead of echoing the array-of-Axes repr

In [None]:
# Replace the text categories in 'Sex' with integer codes.
# The encoder is fitted on the column and then applied to that same column.
sex_label = LabelEncoder()
sex_label.fit(df['Sex'])
df['Sex'] = sex_label.transform(df['Sex'])
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Features: every column except the target ('Rings') and 'Sex'.
X = df.drop(columns=['Rings', 'Sex'])

# Target: the ring count bucketed into three classes,
#   0 -> (0, 8], 1 -> (8, 10], 2 -> (10, inf].
# NOTE(review): `bin` shadows the Python builtin of the same name; the name is
# kept here because other cells may refer to it.
bin = [0, 8, 10, np.inf]
label = [0, 1, 2]
y = pd.cut(df['Rings'], bins=bin, labels=label)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=20,
)

In [None]:
# One histogram per feature of the training split.
train_hist_options = {'bins': 50, 'figsize': (20, 15)}
X_train.hist(**train_hist_options)
plt.show()

In [None]:
# Histogram of the training labels (the binned ring classes).
y_train.hist(bins=50)
plt.show()

In [None]:
# Scale every feature with a min-max scaler.
# The scaler is fitted on the training split only and then applied unchanged
# to the test split. Refitting on the test split would rescale it with its own
# min/max, so train and test features would no longer share a scale and
# test-set statistics would leak into preprocessing.
sc = MinMaxScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Inspect the scaled training features.
X_train

In [None]:
def find_optimal_estimators(start_value, decrement):
    """Search for the random-forest size with the best accuracy on the test split.

    Trains a ``RandomForestClassifier`` with ``n_estimators`` set to
    ``start_value``, ``start_value - decrement``, ... for as long as the value
    is at least ``decrement``, and scores each model on ``X_test``/``y_test``.
    A progress line is printed after every fit.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    NOTE(review): accuracy is measured on the test split, so the value chosen
    here is tuned against that same split.

    Args:
        start_value: Largest number of trees to try.
        decrement: Positive step subtracted between trials; also the smallest
            value that is tried.

    Returns:
        ``[[n_estimators, accuracy]]`` for the best trial (the earliest trial
        wins ties), or ``[]`` when ``start_value < decrement`` and nothing was
        trained.

    Raises:
        ValueError: If ``decrement`` is not positive, since the search loop
            would otherwise never terminate.
    """
    if decrement <= 0:
        raise ValueError(f'decrement must be positive, got {decrement!r}')

    optimal_list = []
    while start_value >= decrement:
        clf = RandomForestClassifier(n_estimators=start_value, random_state=10)
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        # accuracy_score takes (y_true, y_pred).
        acc = accuracy_score(y_test, predictions)
        # Keep only the best trial; the strict '>' means ties keep the earlier one.
        if not optimal_list or acc > optimal_list[0][1]:
            optimal_list = [[start_value, acc]]
        print(f'Testing: {start_value} acc: {acc} current best number: {optimal_list[0][0]}')
        start_value -= decrement
    return optimal_list

In [None]:
# Try forests of 400, 390, ..., 10 trees and return the best-scoring size.
find_optimal_estimators(400, 10)

In [None]:
def compare_kernels(kernels=('rbf', 'linear', 'sigmoid', 'poly')):
    """Fit one support vector classifier per kernel and print its test accuracy.

    Each ``SVC`` is created with ``gamma='scale'``, trained on the module-level
    ``X_train``/``y_train`` and scored on ``X_test``/``y_test``.

    Args:
        kernels: Iterable of kernel names to compare. Defaults to the four
            kernels 'rbf', 'linear', 'sigmoid' and 'poly'.

    Returns:
        None. One line per kernel is printed.
    """
    for kernel in kernels:
        clf = SVC(kernel=kernel, gamma='scale')
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        # accuracy_score takes (y_true, y_pred).
        acc = accuracy_score(y_test, pred)
        print(f'Kernel: {kernel} accuracy: {acc}')

In [None]:
# Print the test accuracy of an SVC for each kernel under comparison.
compare_kernels()

In [None]:
def find_neighbors(lim, decrement):
    """Print the KNN test accuracy for a descending range of neighbor counts.

    Trains a ``KNeighborsClassifier`` with ``n_neighbors`` set to ``lim``,
    ``lim - decrement``, ... for as long as the value is at least 1. Each model
    is trained on the module-level ``X_train``/``y_train`` and scored on
    ``X_test``/``y_test``.

    Args:
        lim: Largest number of neighbors to try.
        decrement: Positive step subtracted between trials.

    Returns:
        None. One line per trial is printed.

    Raises:
        ValueError: If ``decrement`` is not positive, since the search loop
            would otherwise never terminate.
    """
    if decrement <= 0:
        raise ValueError(f'decrement must be positive, got {decrement!r}')

    while lim >= 1:
        knn_clf = KNeighborsClassifier(n_neighbors=lim)
        knn_clf.fit(X_train, y_train)
        knn_pred = knn_clf.predict(X_test)
        acc = accuracy_score(y_test, knn_pred)
        print(f'KNN number of neigbors: {lim} accuracy: {acc}')
        lim -= decrement

In [None]:
# Print the KNN test accuracy for k = 20, 19, ..., 1.
find_neighbors(20, 1)