In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd

# Load the training CSV from the Kaggle input directory into `data`
file_path = '/kaggle/input/conpe1/train.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure and contents
data.head()


In [None]:
# Count nulls per column, then show only the columns that contain at least one
missing_values = data.isna().sum()
missing_values.loc[missing_values.gt(0)]


In [None]:
# Fill missing values: median for Age, a placeholder for Cabin, the most frequent
# port for Embarked.
# The filled Series is assigned back to the column instead of calling
# `data[col].fillna(..., inplace=True)`: the in-place form operates on an
# intermediate Series (chained assignment), which raises a FutureWarning in
# pandas 2.x and leaves `data` unchanged once Copy-on-Write is enabled.
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Cabin'] = data['Cabin'].fillna('Unknown')
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

# Check if all missing values are filled (an empty result means nothing is left)
missing_values_after = data.isnull().sum()
missing_values_after[missing_values_after > 0]


In [None]:
import re

# Create new feature: Family Size (siblings/spouses + parents/children + the passenger)
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

# Extract Title from Name: the first run of letters that follows a space and is
# followed by a period.
# The pattern is a raw string so that `\.` is not parsed as an (invalid) string
# escape, and str.extract yields NaN for a name with no match instead of raising
# AttributeError on `None.group(1)`.
data['Title'] = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Simplify the titles into a handful of groups.
# Titles that are not keys of this mapping become NaN after .map().
title_mappings = {"Mr": "Mr", "Miss": "Miss", "Mrs": "Mrs", 
                  "Master": "Master", "Dr": "Officer", "Rev": "Officer", 
                  "Col": "Officer", "Major": "Officer", "Mlle": "Miss", 
                  "Countess": "Royalty", "Ms": "Mrs", "Lady": "Royalty", 
                  "Jonkheer": "Royalty", "Don": "Royalty", "Sir" : "Royalty", 
                  "Mme": "Mrs", "Capt": "Officer", "Dona": "Royalty"}
data['Title'] = data['Title'].map(title_mappings)

# Check the new columns
data[['FamilySize', 'Title']].head()


In [None]:
# Derive CabinType as the first character of each Cabin string
data['CabinType'] = [cabin[0] for cabin in data['Cabin']]

# Show how many passengers fall into each cabin type
data['CabinType'].value_counts()


In [None]:
import matplotlib.pyplot as plt

# Draw one 40-bin histogram per numeric column: 'Fare' in blue, then 'Age' in green
for column_name, bar_color in (('Fare', 'blue'), ('Age', 'green')):
    plt.figure(figsize=(10, 6))
    plt.hist(data[column_name], bins=40, color=bar_color, edgecolor='black')
    plt.title(f'Distribution of {column_name}')
    plt.xlabel(column_name)
    plt.ylabel('Frequency')
    plt.show()


In [None]:
# Categorize 'Fare' into different bins.
# pd.cut builds right-closed intervals, so with a lowest edge of 0 a value of exactly
# 0 falls outside the first bin (0, 10] and becomes NaN. include_lowest=True makes
# the first interval include its left edge so zero fares land in category 1.
data['FareCategory'] = pd.cut(
    data['Fare'],
    bins=[0, 10, 20, 30, 100, float('inf')],
    labels=[1, 2, 3, 4, 5],
    include_lowest=True,
)

# Categorize 'Age' into different bins (same edge handling as above)
data['AgeCategory'] = pd.cut(
    data['Age'],
    bins=[0, 12, 18, 30, 50, float('inf')],
    labels=[1, 2, 3, 4, 5],
    include_lowest=True,
)

# Check the new features
data[['FareCategory', 'AgeCategory']].head()


In [None]:
# Display the dataset with the engineered feature columns added by the cells above
data.head()


In [None]:
# Split the training data into features and target variable.
# Identifier and free-text columns are dropped from the features.
X_train = data.drop(['PassengerId', 'Perished', 'Name', 'Ticket', 'Cabin'], axis=1)
y_train = data['Perished']

# Load the test data
test_data_path = '/kaggle/input/conpe1/test.csv'
test_data = pd.read_csv(test_data_path)

# Apply the same preprocessing to the test data.
# Filled Series are assigned back rather than using `fillna(inplace=True)` on a
# column selection, which is chained assignment (FutureWarning in pandas 2.x and
# a no-op under Copy-on-Write).
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].median())
# Fare is imputed before binning so that a missing fare does not yield a NaN FareCategory.
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].median())
test_data['Cabin'] = test_data['Cabin'].fillna('Unknown')
test_data['Embarked'] = test_data['Embarked'].fillna(test_data['Embarked'].mode()[0])
test_data = pd.get_dummies(test_data, columns=['Sex', 'Embarked'])
test_data['FamilySize'] = test_data['SibSp'] + test_data['Parch'] + 1
# Raw-string pattern avoids the invalid `\.` string escape; str.extract returns NaN
# for a name without a match instead of failing on `None.group(1)`.
test_data['Title'] = test_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
test_data['Title'] = test_data['Title'].map(title_mappings)
test_data['CabinType'] = test_data['Cabin'].str[0]
# include_lowest=True keeps values equal to the lowest edge (0) in the first bin
# instead of turning them into NaN.
test_data['FareCategory'] = pd.cut(
    test_data['Fare'],
    bins=[0, 10, 20, 30, 100, float('inf')],
    labels=[1, 2, 3, 4, 5],
    include_lowest=True,
)
test_data['AgeCategory'] = pd.cut(
    test_data['Age'],
    bins=[0, 12, 18, 30, 50, float('inf')],
    labels=[1, 2, 3, 4, 5],
    include_lowest=True,
)
# test_data keeps PassengerId; only the feature frame drops it.
X_test = test_data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

# Display the first few rows of the processed test data
X_test.head()


In [None]:
# Preview the training features (categorical columns are not one-hot encoded yet)
X_train.head()

In [None]:
import pandas as pd

# Assumes X_train has already been built by an earlier cell.
# Columns to one-hot encode.
columns_to_encode = ['Sex', 'Embarked', 'Title', 'CabinType']

# Apply one-hot encoding to the selected columns with pd.get_dummies.
X_train_encoded = pd.get_dummies(X_train, columns=columns_to_encode)

# Inspect the encoded dataset.
print(X_train_encoded.head())

import pandas as pd

# Column list kept for parity with the feature-encoding step above.
columns_to_encode = ['Sex', 'Embarked', 'Title', 'CabinType']

# One indicator column per distinct value of the target.
# The source is data['Perished'] rather than the mutable `y_train` variable, so
# re-running this cell after `y_train` has been reassigned gives the same frame.
# The `columns=` argument was dropped: pd.get_dummies only uses it for DataFrame
# input, so it had no effect on this Series.
y_train_encoded = pd.get_dummies(data['Perished'])

# Inspect the encoded target.
print(y_train_encoded.head())



In [None]:
# Build the target: 1 if the passenger perished, 0 otherwise.
# The previous `y_train_encoded.apply(lambda x: 1 if x[0] else 0, axis=1)` read the
# indicator column labelled 0 (i.e. "Perished == 0"), which inverted the label.
# Taking the original column directly keeps 1 = perished.
y_train = data['Perished'].astype(int)

y_train.head()

In [None]:
# Count the NaN values in each column and show the columns that have any
# (the counts were previously computed but never displayed).
nan_counts = X_train_encoded.isna().sum()
print(nan_counts[nan_counts > 0])

# Preview the binned fare column
X_train_encoded['FareCategory'].head()

In [None]:
# Compute the most frequent FareCategory
fare_category_mode = X_train_encoded['FareCategory'].mode()[0]

# Replace missing FareCategory values with that mode.
# Assigned back instead of `fillna(inplace=True)` on the column selection, which
# is chained assignment (FutureWarning in pandas 2.x, no effect under Copy-on-Write).
X_train_encoded['FareCategory'] = X_train_encoded['FareCategory'].fillna(fare_category_mode)


In [None]:
# Build the target: 1 if the passenger perished, 0 otherwise.
# Reading indicator column 0 of the one-hot frame (as this cell did before) selects
# "Perished == 0" and flips the label, so the original column is used directly.
y_train = data['Perished'].astype(int)
y_train.head()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

# Initialize the models.
# The ensemble models are seeded so the cross-validation scores are reproducible
# across kernel restarts.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Support Vector Machine": SVC(),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Perform 5-fold cross-validation and store the mean accuracy of each model
model_scores = {}
for name, model in models.items():
    scores = cross_val_score(model, X_train_encoded, y_train, cv=5, scoring='accuracy')
    model_scores[name] = scores.mean()

model_scores


In [None]:
import pandas as pd

# Assumes X_train has already been built by an earlier cell.
# Columns to one-hot encode.
columns_to_encode = ['Sex', 'Embarked', 'Title', 'CabinType']

# Apply one-hot encoding to the selected columns with pd.get_dummies.
# NOTE: this rebuilds X_train_encoded from X_train, so any changes made to the
# previous X_train_encoded frame are discarded.
X_train_encoded = pd.get_dummies(X_train, columns=columns_to_encode)

# Inspect the encoded dataset.
print(X_train_encoded.head())


In [None]:
# Compute the most frequent FareCategory
fare_category_mode = X_train_encoded['FareCategory'].mode()[0]

# Replace missing FareCategory values with that mode.
# Assigned back instead of `fillna(inplace=True)` on the column selection, which
# is chained assignment (FutureWarning in pandas 2.x, no effect under Copy-on-Write).
X_train_encoded['FareCategory'] = X_train_encoded['FareCategory'].fillna(fare_category_mode)

# Check again whether any NaN values remain
nan_counts = X_train_encoded.isna().sum()
print(nan_counts)


In [None]:
# List the test feature columns for comparison with the encoded training columns
X_test.columns

In [None]:
# List the encoded training feature columns
X_train_encoded.columns

In [None]:
# Apply one-hot encoding to the test features.
# Only the string columns that are still present are encoded. Calling
# pd.get_dummies without `columns=` also expands category-dtype columns, so the
# binned FareCategory / AgeCategory features were turned into dummies and then
# re-created as all-zero columns by the alignment loop below.
test_columns_to_encode = [
    column for column in ['Sex', 'Embarked', 'Title', 'CabinType']
    if column in X_test.columns
]
X_test_encoded = pd.get_dummies(X_test, columns=test_columns_to_encode)

# Keep the binned features as plain numbers and fill any gaps with the most
# frequent training bin, so no categorical column reaches later fill steps.
for column in ['FareCategory', 'AgeCategory']:
    if column in X_test_encoded.columns:
        binned = X_test_encoded[column].astype(float)
        if column in X_train_encoded.columns:
            binned = binned.fillna(float(X_train_encoded[column].mode()[0]))
        X_test_encoded[column] = binned

# Add columns that exist in the training data but not in the test data (filled with 0)
for column in X_train_encoded.columns:
    if column not in X_test_encoded.columns:
        X_test_encoded[column] = 0

# Drop columns that exist only in the test data and match the training column order
X_test_encoded = X_test_encoded[X_train_encoded.columns]

# Inspect the encoded test dataset
print(X_test_encoded.head())


In [None]:
# Count the NaN values in each column of the encoded test features
nan_counts = X_test_encoded.isna().sum()
print(nan_counts)

In [None]:
# Fill missing test fares with the median test fare.
# Only the Fare column is filled: a frame-wide fillna would write the Fare median
# into any other column that happens to contain NaN.
X_test_encoded['Fare'] = X_test_encoded['Fare'].fillna(X_test_encoded['Fare'].median())

In [None]:
# Train the Gradient Boosting model on the entire training set using the encoded data.
# A fixed random_state makes the fitted model, and therefore the submission,
# reproducible across runs.
gradient_boosting_model = GradientBoostingClassifier(random_state=42)
gradient_boosting_model.fit(X_train_encoded, y_train)

# Make predictions on the encoded test set
predictions = gradient_boosting_model.predict(X_test_encoded)

# Display the first few predictions
predictions[:10]


In [None]:
# Number of predictions produced for the test set
predictions.shape

In [None]:
# Preview the test features before one-hot alignment
X_test.head()

In [None]:
# Preview the training frame, including the engineered columns
data.head()

In [None]:
# Build the submission frame: one row per test passenger with its prediction.
# PassengerId is read from `test_data`, which is loaded earlier in the notebook and
# keeps its rows in file order. The previous `data_pred_pre` is only defined in a
# later cell, so this cell raised NameError on Restart & Run All.
submission = pd.DataFrame({
    "PassengerId": test_data['PassengerId'],
    'Perished': predictions
})

# Save to a CSV file
submission.to_csv('submission_gpt.csv', index=False)


In [None]:
# (rows, columns) of the training frame
data.shape

In [None]:
# Read the test CSV again into a separate, unprocessed frame
data_pred_pre=pd.read_csv('/kaggle/input/conpe1/test.csv')

In [None]:
# (rows, columns) of the unprocessed test frame
data_pred_pre.shape