In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
np.set_printoptions(precision=2)

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(solver='liblinear')
rfe = RFE(clf)
rfe.fit(X,y)

In [5]:
loss_mean = df['Age'].astype('float').mean()
df['Age'].replace(np.NaN, loss_mean , inplace=True)
df['Age'] = df['Age'].astype('int')

In [6]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,111369,30.0000,C148,C


In [7]:
df['Sex'].value_counts()
from sklearn.preprocessing import LabelEncoder
gender_enc = LabelEncoder()
gender_enc.fit(df['Sex'])
gender_enc.transform(df['Sex'])
df['Sex'] = gender_enc.transform(df['Sex'])
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",1,27,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",0,19,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,29,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",1,26,0,0,111369,30.0000,C148,C


In [8]:
df.drop(['Name'], axis=1,inplace=True)

In [9]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [10]:
df['Ticket'].value_counts()

347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: Ticket, Length: 681, dtype: int64

In [11]:
df.Cabin.value_counts()

B96 B98        4
G6             4
C23 C25 C27    4
C22 C26        3
F33            3
              ..
E34            1
C7             1
C54            1
E36            1
C148           1
Name: Cabin, Length: 147, dtype: int64

In [12]:
df.Cabin.unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [13]:
df.Cabin.nunique()

147

In [14]:

df.drop(['Ticket','Cabin'], axis=1,inplace=True)
df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22,1,0,7.2500,S
1,2,1,1,0,38,1,0,71.2833,C
2,3,1,3,0,26,0,0,7.9250,S
3,4,1,1,0,35,1,0,53.1000,S
4,5,0,3,1,35,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
886,887,0,2,1,27,0,0,13.0000,S
887,888,1,1,0,19,0,0,30.0000,S
888,889,0,3,0,29,1,2,23.4500,S
889,890,1,1,1,26,0,0,30.0000,C


In [15]:
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       2
dtype: int64

In [16]:
df.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [17]:
df.drop(['Embarked'], axis=1,inplace=True)
df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,1,22,1,0,7.2500
1,2,1,1,0,38,1,0,71.2833
2,3,1,3,0,26,0,0,7.9250
3,4,1,1,0,35,1,0,53.1000
4,5,0,3,1,35,0,0,8.0500
...,...,...,...,...,...,...,...,...
886,887,0,2,1,27,0,0,13.0000
887,888,1,1,0,19,0,0,30.0000
888,889,0,3,0,29,1,2,23.4500
889,890,1,1,1,26,0,0,30.0000


In [18]:
X= df.iloc[:,:-1]   #[: ,0:-1]
y = df['Survived']
print(X.shape,y.shape)


(891, 7) (891,)


In [19]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [20]:
featSelector = SelectKBest(chi2,k=5)
featSelector.fit(X,y)
print(featSelector.scores_)
print(featSelector.get_feature_names_out())

[  3.31 549.    30.87  92.7   23.46   2.58  10.1 ]
['Survived' 'Pclass' 'Sex' 'Age' 'Parch']


In [21]:
features = featSelector.transform(X)
scaler = StandardScaler()
scaledX = scaler.fit_transform(features)
xtrain, xtest, ytrain, ytest = train_test_split(scaledX,y,test_size=.2,random_state=1)
# xtrain.shape,xtest.shape
m = KNeighborsClassifier()
m.fit(xtrain, ytrain)
ypred = m.predict(xtest)
cm = confusion_matrix(ytest, ypred)
print(cm)
print(classification_report(ytest,ypred))

[[105   1]
 [  0  73]]
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       106
           1       0.99      1.00      0.99        73

    accuracy                           0.99       179
   macro avg       0.99      1.00      0.99       179
weighted avg       0.99      0.99      0.99       179



In [22]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [23]:
clf = LogisticRegression(solver='liblinear')
rfe = RFE(clf)
rfe.fit(X,y)

RFE(estimator=LogisticRegression(solver='liblinear'))

In [24]:
rfe.get_feature_names_out()

array(['Survived', 'Pclass', 'Sex'], dtype=object)

In [25]:
n = int(input())
for i in range(1,n+1):
  for j in range(1,i+1):
    print(j, end = " ")
  print()

5
1 
1 2 
1 2 3 
1 2 3 4 
1 2 3 4 5 
