In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import column_or_1d
import numpy as np

#https://towardsdatascience.com/dont-sweat-the-solver-stuff-aea7cddc3451

In [2]:
# Reproducibility and cross-validation settings for the experiment.
seed = 1234      # random_state handed to train_test_split
num_folds = 10   # n_splits of the K-fold cross-validation
n_jobs = -1      # forwarded to cross_val_score

# Experiment switches; no cell shown in this notebook section reads them.
hypertuned_experiment = is_save_results = False


# Parsing Dataset

In [3]:
class SonarParser(object):
    """Load the sonar dataset from a CSV file and expose features and labels.

    The CSV is read with a header row; the column named ``"60"`` holds the
    class label and every remaining numeric column is treated as a feature.

    Attributes:
        URL: Location of the raw dataset (kept for reference; this class does
            not download from it).
        name: Short dataset name used in the printed summary.
        file_name: File name used when saving the parsed data.
        file_path: Path of the CSV that is parsed.
        label_col: Name of the label column.
        X: DataFrame with the numeric feature columns.
        y: Series with the labels, row-aligned with ``X``.
        all: ``X`` and ``y`` concatenated column-wise.
        metric: Scoring metric name used for evaluation.
    """

    def __init__(self, file_path="sonar.csv"):
        """Parse the dataset and print a short summary.

        Args:
            file_path: Path of the CSV to read. Defaults to ``"sonar.csv"``
                in the current working directory.
        """
        self.URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data"
        self.name = "sonar"
        self.file_name = 'sonar.csv'
        self.file_path = file_path
        self.label_col = "60"
        self.X, self.y = self._parse_file()
        self.all = pd.concat([self.X, self.y], axis=1)
        # Metric chosen for the evaluation
        self.metric = "accuracy"
        self._print_stats()

    def _parse_file(self):
        """Read the CSV and split it into features and labels.

        Steps:
            - read the CSV at ``self.file_path``
            - drop every row that contains a NaN
            - separate the label column from the rest
            - keep only numeric feature columns

        Returns:
            Tuple ``(X, y)`` where ``X`` is the feature DataFrame and ``y`` is
            the label Series; both share the same index.
        """
        data = pd.read_csv(self.file_path)
        # Remove rows with missing values before splitting, so X and y stay aligned.
        data_cleaned = data.dropna()

        y = data_cleaned[self.label_col]
        features = data_cleaned.drop(columns=[self.label_col])

        # Keep every numeric dtype (not just float64) so integer or float32
        # features are not silently discarded.
        X = features.select_dtypes(include="number")

        return X, y

    def save_to_csv(self, save_path=None):
        """Write the parsed dataset (features plus label) to a CSV file.

        Args:
            save_path: Destination file. When omitted, the file is written to
                ``../../data/interim/<file_name>`` relative to the current
                working directory.
        """
        if save_path is None:
            save_path = os.path.join("..", "..", "data", "interim", self.file_name)

        # Create the destination directory on demand; a bare file name has no
        # directory component and needs nothing created.
        target_dir = os.path.dirname(save_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        self.all.to_csv(save_path, index=False)

    def _print_stats(self):
        """Print shape, class counts and the first five rows of the dataset."""
        banner = "#" * 30
        print(banner + " Start Dataset - " + self.name + " Stats " + banner)
        print("Dataset shape:", self.all.shape)
        print("Counts for each class:")
        print(self.y.value_counts())
        print("Sample of first 5 rows:")
        print(self.all.head(5))
        print(banner + " End Dataset Stats " + banner)


In [4]:
# Build the parser; its constructor reads the CSV and prints the dataset summary.
parser = SonarParser()


############################## Start Dataset - sonar Stats ##############################
Dataset shape: (208, 61)
Counts for each class:
M    111
R     97
Name: 60, dtype: int64
Sample of first 5 rows:
        0       1       2       3       4       5       6       7       8  \
0  0.0200  0.0371  0.0428  0.0207  0.0954  0.0986  0.1539  0.1601  0.3109   
1  0.0453  0.0523  0.0843  0.0689  0.1183  0.2583  0.2156  0.3481  0.3337   
2  0.0262  0.0582  0.1099  0.1083  0.0974  0.2280  0.2431  0.3771  0.5598   
3  0.0100  0.0171  0.0623  0.0205  0.0205  0.0368  0.1098  0.1276  0.0598   
4  0.0762  0.0666  0.0481  0.0394  0.0590  0.0649  0.1209  0.2467  0.3564   

        9  ...      51      52      53      54      55      56      57  \
0  0.2111  ...  0.0027  0.0065  0.0159  0.0072  0.0167  0.0180  0.0084   
1  0.2872  ...  0.0084  0.0089  0.0048  0.0094  0.0191  0.0140  0.0049   
2  0.6194  ...  0.0232  0.0166  0.0095  0.0180  0.0244  0.0316  0.0164   
3  0.1264  ...  0.0121  0.0036  0.0150

In [5]:
# Pull the features, the labels and the scoring metric off the parser.
x, y, scoring = parser.X, parser.y, parser.metric
y

0      R
1      R
2      R
3      R
4      R
      ..
203    M
204    M
205    M
206    M
207    M
Name: 60, Length: 208, dtype: object

In [6]:
# Pass the label Series through sklearn's column_or_1d; the value displayed
# below is a one-dimensional NumPy array of the class labels.
y_sonar = column_or_1d(y, warn=False)
y_sonar

array(['R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
       'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
       'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
       'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
       'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
       'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
       'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
       'R', 'R', 'R', 'R', 'R', 'R', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M

In [7]:
# Hold out 20% of the rows as a test split; random_state=seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y_sonar,
    test_size=0.20,
    random_state=seed,
)
y_train

array(['M', 'R', 'R', 'R', 'R', 'R', 'M', 'R', 'R', 'R', 'M', 'M', 'M',
       'R', 'R', 'M', 'M', 'R', 'M', 'M', 'M', 'M', 'R', 'M', 'M', 'R',
       'M', 'R', 'M', 'M', 'R', 'R', 'R', 'M', 'M', 'M', 'R', 'R', 'R',
       'M', 'R', 'M', 'R', 'R', 'R', 'R', 'M', 'M', 'M', 'M', 'R', 'R',
       'R', 'R', 'M', 'R', 'R', 'M', 'M', 'M', 'M', 'M', 'R', 'R', 'M',
       'M', 'M', 'R', 'M', 'M', 'M', 'R', 'R', 'M', 'M', 'R', 'R', 'R',
       'M', 'M', 'M', 'R', 'R', 'M', 'M', 'M', 'R', 'M', 'M', 'R', 'R',
       'R', 'M', 'M', 'R', 'R', 'R', 'R', 'M', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'R', 'R', 'M', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R',
       'M', 'M', 'R', 'M', 'R', 'R', 'R', 'M', 'M', 'R', 'R', 'M', 'M',
       'M', 'R', 'R', 'R', 'M', 'R', 'M', 'M', 'M', 'R', 'R', 'R', 'R',
       'M', 'M', 'R', 'R', 'M', 'M', 'M', 'M', 'R', 'M', 'M', 'M', 'R',
       'M', 'R', 'M', 'M', 'M', 'M', 'M', 'R', 'R', 'R'], dtype=object)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler



In [9]:
# L2-penalised logistic regression with verbose output enabled and the
# iteration limit set to 9000.
model = LogisticRegression(
    penalty="l2",
    verbose=1,
    max_iter=9000,
)

In [10]:
# K-fold cross-validation of the model on the training split only.
kfold = model_selection.KFold(n_splits=num_folds)
cv_results = model_selection.cross_val_score(
    model, X_train, y_train, cv=kfold, n_jobs=n_jobs, scoring=scoring
)

# Surface the result instead of discarding it: mean and spread of the
# per-fold scores.
cv_results.mean(), cv_results.std()

In [11]:
# Fit on the whole training split, then score the fitted model on both splits.
model.fit(X_train, y_train)
curr_train_score = model.score(X_train, y_train)
curr_test_score = model.score(X_test, y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [12]:
# Score of the fitted model on the training split.
curr_train_score

0.8554216867469879

In [13]:
# Score of the fitted model on the held-out test split.
curr_test_score

0.6904761904761905

# Prediction

In [22]:
# Predict the class of the first held-out sample. Selecting with iloc[[0]]
# yields a one-row DataFrame, so the column names the model was fitted with
# are kept (a bare NumPy row would drop them).
model.predict(X_test.iloc[[0]])



array(['R'], dtype=object)

In [15]:
# True labels of the held-out split, for comparison with the prediction above.
y_test

array(['R', 'R', 'R', 'M', 'R', 'M', 'M', 'M', 'M', 'R', 'M', 'M', 'R',
       'M', 'M', 'M', 'R', 'R', 'M', 'M', 'R', 'M', 'M', 'M', 'R', 'R',
       'M', 'R', 'M', 'M', 'M', 'M', 'R', 'M', 'M', 'R', 'M', 'R', 'R',
       'M', 'M', 'M'], dtype=object)