# Implementation of Logistic Regression
### Author: Tejaswini Patil
### Reg No: 20MAI0044

In [1]:
#1: Importing necessary Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
#2: Read the dataset onto a variable
train = pd.read_csv("trainT.csv") #Titanic data-set
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
#3: Fix the predictor variables
df = train[['Survived','Pclass','Sex','Age','Fare']] #implies columns

In [4]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [5]:
#4: Change the male to 1 and female to 0
df['Sex'] = df['Sex'].apply(lambda sex:1 if sex=='male' else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [6]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,1,22.0,7.25
1,1,1,0,38.0,71.2833
2,1,3,0,26.0,7.925
3,1,1,0,35.0,53.1
4,0,3,1,35.0,8.05


In [7]:
#5: Handling Missing Values
# df['Age'].isnull().sum()

In [8]:
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
Fare          0
dtype: int64

In [9]:
df['Age'] = df['Age'].fillna(df['Age']).median()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [10]:
df['Age'].isnull().sum()

0

In [11]:
df.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
Fare        0
dtype: int64

In [12]:
#6: Peeking into the dataframe
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,1,28.0,7.25
1,1,1,0,28.0,71.2833
2,1,3,0,28.0,7.925
3,1,1,0,28.0,53.1
4,0,3,1,28.0,8.05


In [13]:
#7: Set the predictor(X) and Response(Y) variables
X = df.drop('Survived',axis=1)
Y = df['Survived']

In [14]:
#8: Splitting the data into train test
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.3,random_state=25)

In [15]:
#9: Call the Logistic Regression Model
logit = LogisticRegression()
logit.fit(X_train,Y_train)    #Naive Bayes Classifier

LogisticRegression()

In [16]:
#10: Compute the Predictions or y_hat
Y_pred = logit.predict(X_test)

In [17]:
Y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 0], dtype=int64)

In [18]:
#11: Confusion Matrix
confusion_matrix = confusion_matrix(Y_test,Y_pred)

In [19]:
confusion_matrix

array([[139,  26],
       [ 35,  68]], dtype=int64)

In [20]:
confusion = pd.crosstab(Y_pred,Y_test,rownames=['Real'],colnames=['Pred'])
confusion

Pred,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
0,139,35
1,26,68


In [21]:
#12: Accuracy Score
accuracy_score(Y_test,Y_pred)

0.7723880597014925

In [22]:
#13: Classification Report
report = classification_report(Y_test,Y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.80      0.84      0.82       165
           1       0.72      0.66      0.69       103

    accuracy                           0.77       268
   macro avg       0.76      0.75      0.76       268
weighted avg       0.77      0.77      0.77       268



In [23]:
#14: Implementing Naive Baye's Classifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=25)
gnb = GaussianNB()
mnb = MultinomialNB()
y_pred_gnb = gnb.fit(X_train, y_train).predict(X_test)
#12: Accuracy Score
acc_gnb = accuracy_score(y_test,y_pred_gnb)
print(acc_gnb)
cnf_matrix_gnb = confusion_matrix(y_test, y_pred_gnb)
print(cnf_matrix_gnb)

0.7686567164179104


TypeError: 'numpy.ndarray' object is not callable

In [24]:
y_pred_mnb = mnb.fit(X_train, y_train).predict(X_test)
cnf_matrix_mnb = confusion_matrix(y_test, y_pred_mnb)
acc_mnb = accuracy_score(y_test,y_pred_mnb)
print(acc_mnb)
print(cnf_matrix_mnb)

TypeError: 'numpy.ndarray' object is not callable

In [25]:
report_gnb = classification_report(y_test,y_pred_gnb)
print(report_gnb)

              precision    recall  f1-score   support

           0       0.83      0.79      0.81       165
           1       0.68      0.74      0.71       103

    accuracy                           0.77       268
   macro avg       0.76      0.76      0.76       268
weighted avg       0.77      0.77      0.77       268



In [26]:
report_mnb = classification_report(y_test,y_pred_mnb)
print(report_mnb)

              precision    recall  f1-score   support

           0       0.71      0.81      0.75       165
           1       0.60      0.46      0.52       103

    accuracy                           0.68       268
   macro avg       0.65      0.63      0.64       268
weighted avg       0.67      0.68      0.66       268

