## Feature selection techniques

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2, f_classif

# Load the dataset
# NOTE(review): this is an absolute, machine-specific Windows path; the cell
# only runs where the file exists at exactly this location.
TITANIC_CSV_PATH = "C:\\Users\\svish\\OneDrive\\Desktop\\titanic-dataset\\train.csv"
raw_df = pd.read_csv(TITANIC_CSV_PATH)

# Keep the target plus the five candidate features, drop every row that has a
# missing value in any of them, then replace the two categorical columns with
# the integer codes produced by LabelEncoder.
# The frame is built in one non-mutating chain (no inplace=True on a column
# subset of the loaded frame), so every step returns a fresh DataFrame and the
# column replacement cannot trigger SettingWithCopyWarning. The untouched
# input stays available as raw_df.
#  Encode categorical variables
df = (
    raw_df[['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked']]
    .dropna()
    .assign(
        Sex=lambda frame: LabelEncoder().fit_transform(frame['Sex']),
        Embarked=lambda frame: LabelEncoder().fit_transform(frame['Embarked']),
    )
)

#  features (X) and target (y)
X = df.drop(columns='Survived')
y = df['Survived']

# Correlation -
# Correlation of every column with the target (DataFrame.corr uses Pearson by
# default). The matrix is computed once and reused for both the printed report
# and the threshold filter below.
correlation = df.corr()['Survived']
print("Correlation with target (Survived):")
print(correlation)

#  Thresholding -
# Keep the features whose ABSOLUTE correlation with the target exceeds the
# cut-off, so strongly negative features (e.g. a value of -0.5) are selected
# too. The target's own entry is dropped before filtering.
CORRELATION_THRESHOLD = 0.1
feature_correlation = correlation.drop('Survived')
selected_features = feature_correlation[
    feature_correlation.abs() > CORRELATION_THRESHOLD
].index.tolist()
print(f"\nFeatures with correlation > {CORRELATION_THRESHOLD}:")
print(selected_features)

# Chi-Square Test
# Score every feature against the target with the chi-squared statistic.
# Age and Fare are cast to integers on a separate frame (X_chi), so X itself
# keeps its original values for the steps that reuse it.
# NOTE(review): sklearn's chi2 is documented for non-negative features such as
# counts or booleans. Age and Fare are continuous here, so their scores depend
# on each column's scale - confirm this comparison is intended.
X_chi = X.assign(
    Age=X['Age'].astype(int),
    Fare=X['Fare'].astype(int),
)

# k='all' keeps every feature; the selector is used only to obtain scores_.
chi_selector = SelectKBest(chi2, k='all').fit(X_chi, y)

print("\nChi-Square scores:")
chi_scores = dict(zip(X_chi.columns, chi_selector.scores_))
for feature, score in chi_scores.items():
    print(f"{feature}: {score:.2f}")

# ANOVA (Analysis of Variance) -
# Score every feature in X against the target with the ANOVA F-statistic
# (f_classif). k='all' keeps every feature, so the selector is used only to
# obtain scores_.
anova_selector = SelectKBest(f_classif, k='all').fit(X, y)

print("\nANOVA F-scores:")
anova_scores = dict(zip(X.columns, anova_selector.scores_))
for feature, score in anova_scores.items():
    print(f"{feature}: {score:.2f}")


Correlation with target (Survived):
Survived    1.000000
Pclass     -0.356462
Sex        -0.536762
Age        -0.082446
Fare        0.266100
Embarked   -0.181979
Name: Survived, dtype: float64

Features with correlation > 0.1:
['Pclass', 'Sex', 'Fare', 'Embarked']

Chi-Square scores:
Pclass: 28.24
Sex: 74.62
Age: 34.26
Fare: 4137.44
Embarked: 8.96

ANOVA F-scores:
Pclass: 103.35
Sex: 287.35
Age: 4.86
Fare: 54.11
Embarked: 24.32
