<a href="https://colab.research.google.com/github/yashparab7962/Data-Analytics/blob/main/Titanic_Machine_Learning_from_Disaster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', 1000)
# Ignore all DeprecationWarnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from scipy.stats import chi2_contingency


In [None]:
# Both competition files are read from the same Kaggle input directory.
DATA_DIR = '/kaggle/input/titanic'
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
train_df.head()

In [None]:
test_df.head()

<h3>Data Preprocessing</h3>

In [None]:
print('Train Dataset Shape',train_df.shape)
print('------------------------------------')
print('Test Dataset Shape',test_df.shape)

In [None]:
# Tag every row with its origin so the combined frame can be split back into
# train and test once the shared preprocessing is done.
train_df['dataset_type']='train'
test_df['dataset_type']='test'
# ignore_index=True gives the combined frame a single unique 0..n-1 index.
# Without it the test rows keep the index labels they got from read_csv, which
# repeat labels already used by the train rows, so label-based lookups and
# index-aligned operations become ambiguous.
df=pd.concat([train_df,test_df], ignore_index=True)
df.head()

In [None]:
print('Combine Dataset Shape',df.shape)
df['dataset_type'].value_counts()

In [None]:
#Check For Duplicate Records
df.duplicated().sum()

In [None]:
df.info()

In [None]:
# Share of missing values per column, expressed as a percentage.
records = df.isnull().mean() * 100

fig, ax = plt.subplots(figsize=(10, 5))
records.plot(kind='bar', ax=ax)
ax.set_xlabel('Features')
ax.set_ylabel('Percentage Of Missing Values')
ax.set_title('Missing Value Plot')

# Write each percentage just above its bar.
for position, percent in enumerate(records):
    ax.text(position, percent + 0.5, f'{percent:.2f}%', ha='center', va='bottom')

plt.show()

<h4>Based on the null value analysis, we observe that the Cabin feature has approximately 77% missing values.
Due to the high percentage of missing data, it's impractical to address this through imputation techniques.
Therefore, we recommend removing the Cabin column from the dataset to ensure data quality</h4>

<h3>The Ticket feature consists mostly of distinct values, so it is unlikely to have any meaningful impact on predicting survival.
Hence we remove the Ticket column from the dataset.
</h3>

In [None]:
df.drop(['Cabin', 'Ticket'], axis=1, inplace=True)


In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe(include='all')

In [None]:
# 2x2 grid: box plot and distribution plot for Age (top row) and Fare (bottom row).
fig, axes = plt.subplots(2, 2, figsize=(10, 5))

panels = [
    ('Age', 'box'),
    ('Age', 'hist'),
    ('Fare', 'box'),
    ('Fare', 'kde'),
]
for panel_ax, (feature, plot_kind) in zip(axes.flatten(), panels):
    df[feature].plot(kind=plot_kind, ax=panel_ax)

plt.tight_layout()

plt.show()


In [None]:
# The Fare column contains an extreme outlier (a fare of about 512).
# NOTE(review): the original note said those records are removed, but no cell in this notebook drops them - confirm the intent.
# Fare is right-skewed, which might affect model performance; to reduce the effect of the skew, Fare is converted into bins later on.
df[['Age','Fare']].skew()

In [None]:
df['Embarked'].value_counts()

In [None]:
print('Avg Fare',df[df['Fare']>100]['Fare'].median())
print('Avg Age',df['Age'].median())
df['Embarked'].value_counts()

<h3>Missing Or Null Value Imputation</h3>

In [None]:
# Age: fill gaps with the median age of all passengers.
df['Age']=df['Age'].fillna(df['Age'].median())

# Fare: fill gaps with the median of all known fares. The previous version used
# the median of fares above 100 only, which assigns a top-bracket fare to any
# passenger whose fare is missing, regardless of their class.
df['Fare']=df['Fare'].fillna(df['Fare'].median())

# Embarked: fill gaps with 'S' (presumably the most frequent port according to
# the value_counts cell - verify against its output).
df['Embarked']=df['Embarked'].fillna('S')


In [None]:
fare_bins = [0, 7.89, 14.45, 31.07, 100, 263]
# Define professional labels for each bin
fare_labels = ['Very Low Fare', 'Low Fare', 'Moderate Fare', 'High Fare', 'Very High Fare']
# Create a new column 'Fare_bin' by binning the 'Fare' column.
# pd.cut returns NaN for values outside the outermost edges, so any fare above
# the last edge (the ~512 outlier noted earlier) would end up without a bin and
# silently drop out of every crosstab. Capping Fare at the top edge keeps those
# passengers in 'Very High Fare' while leaving the bin edges unchanged.
capped_fare = df['Fare'].clip(upper=fare_bins[-1])
df['Fare_bin'] = pd.cut(capped_fare, bins=fare_bins, labels=fare_labels, include_lowest=True)

age_bins = [0, 12, 21, 28, 39, 80]

# Define professional labels for each bin
age_labels = ['Child', 'Teenager', 'Young Adult', 'Adult', 'Senior']

# Create a new column 'Age_bin' by binning the 'Age' column
df['Age_bin'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, include_lowest=True)

<ul>
    <li>
        <strong>Age Bins:</strong>
    </li>
    <ul>
        <li>Child: (0 - 12)</li>
        <li>Teenager: (12 - 21)</li>
        <li>Young Adult: (21 - 28)</li>
        <li>Adult: (28 - 39)</li>
        <li>Senior: (39 - 80)</li>
    </ul>
</ul>

<ul>
    <li>
        <strong>Fare Bins:</strong>
    </li>
    <ul>
        <li>Very Low Fare: (0 - 7.89)</li>
        <li>Low Fare: (7.89 - 14.45)</li>
        <li>Moderate Fare: (14.45 - 31.07)</li>
        <li>High Fare: (31.07 - 100)</li>
        <li>Very High Fare: (100 - 263)</li>
    </ul>
</ul>


In [None]:
df.head()

In [None]:
#remove unwanted columns from datasets
df.drop(['Fare',"Age"],axis=1,inplace=True)

In [None]:
df.head()

In [None]:
df['Name']

In [None]:
def get_unique_titles(names_list):
    """Extract the honorific title from a single passenger name.

    Names are expected in the form ``"Surname, Title. Given names"``; the
    title is the text between the first comma and the following period.

    Parameters
    ----------
    names_list : str
        One passenger name (despite the parameter name, not a list).

    Returns
    -------
    str
        The stripped title, or ``'Unknown'`` when the value is not a string,
        has no comma, has no period after the comma, or the title is empty.
    """
    # Missing names (NaN/None) have no .split(); treat them as unknown
    # instead of raising inside DataFrame.apply.
    if not isinstance(names_list, str):
        return 'Unknown'

    parts = names_list.split(',')
    if len(parts) < 2:
        return 'Unknown'

    # Without a period there is no title delimiter; do not mistake the rest of
    # the name for a title.
    title, period, _ = parts[1].partition('.')
    if not period:
        return 'Unknown'

    return title.strip() or 'Unknown'




In [None]:
df['Title'] = df['Name'].apply(get_unique_titles)


In [None]:
# Per the original note, the titles that rank after 'Master' each have fewer than 10 records, so they are grouped into 'Others' in the next cell.
df['Title'].value_counts()

In [None]:
titles_to_keep = ['Mr', 'Miss', 'Mrs', 'Master']

# Keep the four common titles as they are; every other title becomes 'Others'.
is_common_title = df['Title'].isin(titles_to_keep)
df['Title'] = df['Title'].where(is_common_title, 'Others')

df['Title'].value_counts()

In [None]:
df.drop(['Name'],axis=1,inplace=True)

In [None]:
df.head()

<p>SibSp: Number of siblings and spouses aboard.</p>
<p>Parch: Number of parents and children aboard.</p>
<p>So we create a new feature called FamilySize.</p>


In [None]:
df['FamilySize']=df['SibSp']+df['Parch']
df.head()

In [None]:
df['FamilySize'].plot(kind='hist')

<p>Here’s how you can categorize family sizes into bins:</p>
<ul>
    <li><strong>Single (0)</strong>: No family members aboard.</li>
    <li><strong>Small Family (1-2)</strong>: 1 or 2 family members aboard.</li>
    <li><strong>Medium Family (3-4)</strong>: 3 or 4 family members aboard.</li>
    <li><strong>Large Family (5-6)</strong>: 5 or 6 family members aboard.</li>
    <li><strong>Very Large Family (7+)</strong>: 7 or more family members aboard.</li>
</ul>


In [None]:
# pd.cut intervals are right-inclusive, so the leading -1 edge is what lets a
# family size of 0 fall into the first ('Single') bucket.
bins = [-1, 0, 2, 4, 6, 10]
labels = ['Single', 'Small Family', 'Medium Family', 'Large Family', 'Very Large Family']

# Attach the binned family size as a new categorical feature.
df = df.assign(FamilySizeCategory=pd.cut(df['FamilySize'], bins=bins, labels=labels))
df['FamilySizeCategory'].value_counts()

In [None]:
df.drop(['FamilySize','SibSp','Parch'],axis=1,inplace=True)

In [None]:
df.head()

In [None]:
df.info()

<h3>Chi-Squared Test & Contingency Table </h3>

In [None]:
def chi_squared_test(df, categorical_variable):
    """Run a chi-squared independence test of one column against 'Survived'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``categorical_variable`` and a ``'Survived'`` column.
    categorical_variable : str
        Name of the column to cross-tabulate with ``'Survived'``.

    Returns
    -------
    tuple
        ``(chi2 statistic, p-value, contingency table)``.
    """
    # Observed counts: one row per category, one column per survival outcome.
    observed = pd.crosstab(df[categorical_variable], df['Survived'])

    # Only the statistic and p-value are reported; the remaining outputs of
    # chi2_contingency (degrees of freedom, expected counts) are not needed.
    outcome = chi2_contingency(observed)
    statistic, p_value = outcome[0], outcome[1]

    return statistic, p_value, observed

# Categorical features whose association with survival is tested.
categorical_variables = ['Pclass','Sex','Embarked','Fare_bin', 'Age_bin', 'Title', 'FamilySizeCategory']

# Run the test once per feature; chi_squared_test returns its three outputs in
# the same order as these keys.
result_keys = ('Chi-Squared Statistic', 'p-value', 'Contingency Table')
results = {
    feature: dict(zip(result_keys, chi_squared_test(df, feature)))
    for feature in categorical_variables
}

# Print one report section per feature.
for feature, outcome in results.items():
    print(f"Variable: {feature}")
    print(f"Chi-Squared Statistic: {outcome['Chi-Squared Statistic']:.4f}")
    print(f"p-value: {outcome['p-value']:.4f}")
    print("Contingency Table:")
    print(outcome['Contingency Table'])
    print("\n" + "-"*40 + "\n")

<h3>Interpreting Results</h3>
Chi-Squared Statistic: A larger value indicates a stronger association between the categorical variable and survival.

p-value: Typically, a p-value less than 0.05 suggests that there is a statistically significant association between the categorical variable and survival.

By following the above process, you can analyze how different categorical factors relate to survival on the Titanic.

<div>
  <h2>The features most significantly associated with survival on the Titanic are as follows:</h2>
  <ul>
    <li><strong>Sex:</strong> Females had the highest survival rates.</li>
    <li><strong>Title:</strong> Titles reflect gender and social status, correlating strongly with survival chances.</li>
    <li><strong>Pclass:</strong> Higher-class passengers (1st class) had much better survival rates compared to lower classes.</li>
    <li><strong>Fare_bin:</strong> Passengers paying higher fares had better survival chances, consistent with class associations.</li>
    <li><strong>Embarked:</strong> The port of embarkation also plays a role, with certain ports having higher survival rates.</li>
    <li><strong>FamilySizeCategory:</strong> While significant, family size was less impactful than the above features.</li>
    <li><strong>Age_bin:</strong> Although significant, age had the weakest association among these features, with children having the highest survival rates.</li>
  </ul>
  <h3>In summary, Sex and Pclass emerge as the most impactful features regarding survival on the Titanic, closely followed by Title.</h3>
</div>


<h2>Data Visualization</h2>

<h4>Univariate Analysis</h4>

In [None]:
categorical_cols = ['Survived', 'Pclass', 'Sex', 'Embarked',
                    'Fare_bin', 'Age_bin', 'Title', 'FamilySizeCategory']

# Two plots per row; ceiling division gives the number of rows required.
n_cols = 2
n_rows = -(-len(categorical_cols) // n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, n_rows * 4))
axes = axes.flatten()  # 1-D view so axes pair up with columns one-to-one

# One annotated count plot per categorical column.
for ax, col in zip(axes, categorical_cols):
    # Two-column frame: the category value and how often it occurs.
    counts = df[col].value_counts().rename_axis(col).reset_index(name='Count')

    sns.barplot(data=counts, x=col, y='Count', ax=ax, palette='coolwarm')

    ax.set_title(f'Count of {col}')
    ax.set_ylabel('Count')
    ax.set_xlabel(col)

    # Print each bar's count slightly above the bar.
    for bar in ax.patches:
        bar_height = bar.get_height()
        ax.annotate(f'{int(bar_height)}',
                    (bar.get_x() + bar.get_width() / 2., bar_height),
                    ha='center', va='bottom',
                    fontsize=10, color='black',
                    xytext=(0, 5),  # 5 points vertical offset
                    textcoords='offset points')

# Drop grid cells that did not receive a plot.
for unused_ax in axes[len(categorical_cols):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

<h4>Bivariate Analysis</h4>

<h3>
Now we analyze features with respect to survival to gain additional knowledge. We have the "Survived" column in the training data with 888 records. In univariate analysis, we consider the total count from both the training and test datasets. However, in bivariate analysis, we only work with the training data to identify patterns.
</h3>

In [None]:
# Survival analysis by demographics:
# passenger counts per class, broken down by sex, age group and survival.
demographic_columns = [df['Sex'], df['Age_bin'], df['Survived']]
Demographics_crosstab = pd.crosstab(index=df['Pclass'], columns=demographic_columns)

fig, ax = plt.subplots(figsize=(14, 6))
sns.heatmap(Demographics_crosstab, annot=True, fmt='d', cmap='viridis',
            cbar_kws={'label': 'Count'}, ax=ax)
ax.set_title('Survival Count by Age and Sex')
ax.set_ylabel('Pclass')
ax.set_xlabel('Sex and Survival Status')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()

**From the above chart, it is evident that male passengers in Pclass 3 who fall into the Teenager age category were the least likely to have survived.**

In [None]:
# Socioeconomic impact:
# passenger counts per class, broken down by fare bracket and survival.
Socioeconomic_crosstab=pd.crosstab(index=df['Pclass'],columns=[df['Fare_bin'],df['Survived']])
plt.figure(figsize=(14, 6))
sns.heatmap(Socioeconomic_crosstab, annot=True, fmt='d', cmap='viridis', cbar_kws={'label': 'Count'})
# Title fixed: it was copy-pasted from the age/sex plot, but this chart shows
# class against fare bracket.
plt.title('Survival Count by Pclass and Fare')
plt.ylabel('Pclass')
plt.xlabel('Fare and Survival Status')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

**From the above chart, it is evident that out of a total of 888 passengers in Pclass 3, 264 did not survive. Among those who did not survive, 130 belonged to the "Very Low Fare" category, while 134 were in the "Low Fare" category.**

In [None]:
# Family and social structure impact:
# passenger counts per family-size category, split by survival.

family_crosstab=pd.crosstab(index=df['FamilySizeCategory'],columns=[df['Survived']])
plt.figure(figsize=(5, 3))
sns.heatmap(family_crosstab, annot=True, fmt='d', cmap='viridis', cbar_kws={'label': 'Count'})
# Title fixed: it was copy-pasted from the age/sex plot, but this chart shows
# family size against survival.
plt.title('Survival Count by Family Size')
plt.ylabel('Family Size')
plt.xlabel('Survival Status')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Survival counts per title and per embarkation port (travel origin).
title_crosstab = pd.crosstab(index=df['Title'], columns=[df['Survived']])
embarked_crosstab = pd.crosstab(index=df['Embarked'], columns=[df['Survived']])

# One row, two panels: (table, panel title, y-axis label) for each heatmap.
fig, panel_axes = plt.subplots(1, 2, figsize=(12, 5))
panel_specs = [
    (title_crosstab, 'Survival Count by Title', 'Title'),
    (embarked_crosstab, 'Survival Count by Embarkation Point', 'Embarkation Point'),
]

for panel_ax, (table, heading, row_label) in zip(panel_axes, panel_specs):
    sns.heatmap(table, annot=True, fmt='d', cmap='viridis',
                cbar_kws={'label': 'Count'}, ax=panel_ax)
    panel_ax.set_title(heading)
    panel_ax.set_ylabel(row_label)
    panel_ax.set_xlabel('Survival Status')
    plt.setp(panel_ax.get_xticklabels(), rotation=45)

# Adjust layout
plt.tight_layout()
plt.show()

**From the chart, it is evident that passengers with the title 'Mr.' had the highest number of non-survivors,
totaling 436, while the embarkation point with the highest number of non-survivors was 'S', with 427 passengers not surviving.**


In [None]:
# Survival counts across sex, port, class and fare bracket combined.
# Stored under its own name: the previous version reused `family_crosstab`,
# which overwrote the family-size table with this unrelated one.
combined_crosstab=pd.crosstab(index=df['Survived'],columns=[df['Sex'],df['Embarked'],df['Pclass'],df['Fare_bin']])
plt.figure(figsize=(30, 5))
sns.heatmap(combined_crosstab, annot=True, fmt='d', cmap='viridis', cbar_kws={'label': 'Count'})
plt.title('Survival Count by Sex, Embarked,Pclass,Fare_bin')
plt.ylabel('Survived')
plt.xlabel('Sex, Embarked,Pclass,Fare_bin Status')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

<h3>Model Building and Evaluation</h3>

In [None]:
df.head()

In [None]:
df.columns

In [None]:
columns=['Sex', 'Embarked', 'Fare_bin', 'Age_bin', 'Title', 'FamilySizeCategory']

from sklearn.preprocessing import LabelEncoder

label_encoder=LabelEncoder()


for column in columns:
    if isinstance(df[column].dtype, pd.CategoricalDtype):
        # Columns produced by pd.cut are ordered categoricals. LabelEncoder
        # would number their labels alphabetically (e.g. 'High Fare' before
        # 'Low Fare'), discarding the bin order; the category codes keep the
        # order in which the bins were defined. Missing values get code -1.
        df[column]= df[column].cat.codes
    else:
        # Plain string columns have no inherent order, so alphabetical
        # integer codes are as good as any.
        df[column]= label_encoder.fit_transform(df[column])

df.head()



<h3>Now split the data back into train and test sets so that we can build the model on the training dataset</h3>

In [None]:
# Undo the earlier concatenation using the origin tag stored on each row.
is_train_row = df['dataset_type'] == 'train'
is_test_row = df['dataset_type'] == 'test'
clean_train_df = df[is_train_row]
clean_test_df = df[is_test_row]


In [None]:
print('Clean Train Dataset Shape',clean_train_df.shape)
print('------------------------------------')
print('Clean Test Dataset Shape',clean_test_df.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report


In [None]:

from sklearn.model_selection import train_test_split,GridSearchCV

x=clean_train_df[['Pclass','Sex','Embarked','Fare_bin','Age_bin','Title','FamilySizeCategory']]
# Cast the target to int so every model is trained on, and predicts, integer
# 0/1 labels. (Survived is presumably float here because it was concatenated
# with test rows that carry no label - verify with df.info().)
y=clean_train_df['Survived'].astype(int)

# Hold out a third of the labelled rows for evaluation; fixed seed for a
# repeatable split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [None]:
def train_model(model, model_name):
    """Fit a classifier on the training split and print its evaluation.

    Reads the module-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.
    Prints the model name, the accuracy on both splits and the classification
    report for the test split.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    model_name : str
        Label printed at the top of the report.

    Returns
    -------
    estimator
        The same ``model`` object, now fitted.
    """
    print(f'Model: {model_name}')

    model.fit(x_train, y_train)

    # Predict on both splits so the two accuracies can be compared
    # (a large gap between them hints at overfitting).
    train_predictions = model.predict(x_train)
    test_predictions = model.predict(x_test)

    evaluations = (
        ('Training', y_train, train_predictions),
        ('Testing', y_test, test_predictions),
    )
    for split_label, truth, predicted in evaluations:
        print(f'{split_label} Accuracy Score: {accuracy_score(truth, predicted):.2f}')

    # The detailed per-class report is shown for the held-out split only.
    print('Classification Report:')
    print(classification_report(y_test, test_predictions))

    return model


In [None]:
# Candidate classifiers, keyed by the short name printed in each report.
model_list = dict(
    knn=KNeighborsClassifier(n_neighbors=3, metric='minkowski', p=2),
    svc=SVC(kernel='linear', random_state=0),
    logistic=LogisticRegression(),
    naive=GaussianNB(),
    tree=DecisionTreeClassifier(criterion='entropy', random_state=0),
    # random_state added: without a seed the forest is built from different
    # random samples on every run, so its scores were not reproducible.
    forest=RandomForestClassifier(n_estimators=50, criterion="entropy", random_state=0),
    xgboost=XGBClassifier(),
    gradientboost=GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=1)
)
# Train and evaluate every candidate with the same helper.
for key, value in model_list.items():
    print('*'*30)
    train_model(value,key)

<h3>Applying Hyperparameter Tuning with GridSearchCV for Model Optimization</h3>

In [None]:
# Hyper-parameter values to try; every combination is evaluated.
param_grid = {
    'max_depth': [3, 5, 7, 9, 11],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [50, 100, 150, 200],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Base estimator whose parameters are searched.
model = XGBClassifier(random_state=42)

# 5-fold cross-validated search scored by accuracy, using all CPU cores.
search_settings = dict(scoring='accuracy', cv=5, verbose=0, n_jobs=-1)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, **search_settings)
grid_search.fit(x_train, y_train)

# Best combination and its mean cross-validation accuracy.
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)

# Check the refitted best estimator against the held-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Set Accuracy:", test_accuracy)

In [None]:
# Final XGBoost configuration used for the submission.
# NOTE(review): these values look like they were copied from a grid-search
# run - confirm they still match grid_search.best_params_.
# `use_label_encoder` is no longer passed: the option is obsolete in current
# XGBoost releases and only produces a warning there.
model=XGBClassifier(
    colsample_bytree=1.0,
    learning_rate=0.01,
    max_depth=7,
    n_estimators=100,
    subsample=1.0,
    random_state=42,        # same seed as the grid-search estimator, for repeatable results
    eval_metric='logloss'   # two-class log-loss; 'mlogloss' is the multi-class metric
)

train_model(model,'XGBOOST')

<h3>We will use the XGBoost model to predict the output, achieving an accuracy of 83%</h3>

In [None]:
clean_test_df.head()

In [None]:
clean_test_df.shape

In [None]:
X_test=clean_test_df[['Pclass','Sex','Embarked','Fare_bin','Age_bin','Title','FamilySizeCategory']]


In [None]:
# Assemble the submission: one row per test passenger with the predicted label.
# The predictions are cast to int so the CSV always contains 0/1 rather than
# 0.0/1.0, whatever label dtype the model was trained on.
submission_df = pd.DataFrame({
    'PassengerId': clean_test_df['PassengerId'],
    'Survived': np.asarray(model.predict(X_test)).astype(int)
})

In [None]:
submission_df.to_csv('submission.csv', index=False)
