In [1]:
## Required Imports

import pandas as pd
import pandas_profiling
from sklearn import preprocessing

In [2]:
## Load the text file

with open('data.txt') as f:
    data = f.readlines()
data[:5]

['5.1,3.5,1.4,0.2,Iris-setosa\n',
 '4.9,3.0,1.4,0.2,Iris-setosa\n',
 '4.7,3.2,1.3,0.2,Iris-setosa\n',
 '4.6,3.1,1.5,0.2,Iris-setosa\n',
 '5.0,3.6,1.4,0.2,Iris-setosa\n']

In [3]:
## Convert lines to list of lists

flowers = []
for row in data:
    flowers.append(row.replace('\n','').split(','))
flowers[:6]

[['5.1', '3.5', '1.4', '0.2', 'Iris-setosa'],
 ['4.9', '3.0', '1.4', '0.2', 'Iris-setosa'],
 ['4.7', '3.2', '1.3', '0.2', 'Iris-setosa'],
 ['4.6', '3.1', '1.5', '0.2', 'Iris-setosa'],
 ['5.0', '3.6', '1.4', '0.2', 'Iris-setosa'],
 ['5.4', '3.9', '1.7', '0.4', 'Iris-setosa']]

In [4]:
## Create Dataframe

columns = ['f1', 'f2','f3','f4', 'target']
df = pd.DataFrame(flowers, columns=columns)

In [5]:
## Class distribution

df.groupby('target').size()

target
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
dtype: int64

In [6]:
## Data Description

df.describe()

Unnamed: 0,f1,f2,f3,f4,target
count,150.0,150.0,150.0,150.0,150
unique,35.0,23.0,43.0,22.0,3
top,5.0,3.0,1.5,0.2,Iris-virginica
freq,10.0,26.0,14.0,28.0,50


In [7]:
## Pandas profiling library to analyse and visualize the data even better 

pandas_profiling.ProfileReport(df)



In [8]:
## Convert classes to Categorical

label_encoder = preprocessing.LabelEncoder()

df['target']= label_encoder.fit_transform(df['target']) 
  
df['target'].unique()

array([0, 1, 2])

In [9]:
## Data Preview

df.head()

Unnamed: 0,f1,f2,f3,f4,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [10]:
## Split Data into Train and Test

from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.3)

print('Train size: ', len(train))
print('Test size: ', len(test))

Train size:  105
Test size:  45


In [11]:
## Separate features columns and target column for train and test set both

X_train = train.drop(columns=['target'])
y_train = train['target']

X_test = test.drop(columns=['target'])
y_test = test['target']

In [12]:
## Implementing Logistic Regression (training and prediction)

from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=0, solver='lbfgs', multi_class='auto')

model.fit(X_train, y_train)

predictions = model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [13]:
## Evaluation

from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report



print(confusion_matrix(y_test, predictions))

print(accuracy_score(y_test, predictions))

print(classification_report(y_test, predictions))

[[13  0  0]
 [ 0 15  1]
 [ 0  2 14]]
0.9333333333333333
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.88      0.94      0.91        16
           2       0.93      0.88      0.90        16

    accuracy                           0.93        45
   macro avg       0.94      0.94      0.94        45
weighted avg       0.93      0.93      0.93        45

