In [34]:
import pandas as pd
import numpy as np
import mlflow

In [2]:
# data loader
def load_data(file_name: str):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_name : str
        Path of the CSV file; handed unchanged to ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(file_name)

In [3]:
df = load_data('data/train.csv')

In [6]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [9]:
df['Cabin'].value_counts()

Cabin
G6             4
C23 C25 C27    4
B96 B98        4
F2             3
D              3
              ..
E17            1
A24            1
C50            1
B42            1
C148           1
Name: count, Length: 147, dtype: int64

In [10]:
# preprocessing
def preprocess_data(df: pd.DataFrame):
    """Turn a raw Titanic passenger frame into an all-float feature frame.

    The input frame is never modified: all work happens on a copy, so the
    function can be called repeatedly on the same raw frame and the raw
    frame keeps its original columns and missing values.

    Steps applied to the copy:
      * ``Sex`` -> ``Sex_Encoded`` (male=0, female=1)
      * missing ``Age`` -> median ``Age`` of the frame passed in
      * first character of ``Cabin`` -> ``Deck_Encoded``
        (A..G = 0..6, T = 7, missing cabin 'U' = 8)
      * missing ``Embarked`` -> most frequent value of the frame passed in,
        then ``Embarked_Encoded`` (C=0, Q=1, S=2)
      * ``PassengerId``, ``Name``, ``Ticket``, ``Cabin``, ``Deck``,
        ``Embarked`` and ``Sex`` are dropped
      * every remaining column is cast to float

    Parameters
    ----------
    df : pd.DataFrame
        Raw passenger data. Must contain the columns ``PassengerId``,
        ``Name``, ``Sex``, ``Age``, ``Ticket``, ``Cabin`` and ``Embarked``;
        any other columns are kept and must be convertible to float.

    Returns
    -------
    pd.DataFrame
        A new frame holding only float columns.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Work on a copy: the column assignments below would otherwise add and
    # overwrite columns on the caller's frame.
    df = df.copy()

    # encode gender (values outside the mapping become NaN)
    df['Sex_Encoded'] = df['Sex'].map({'male': 0, 'female': 1})

    # fill missing values for 'Age' with the median
    df['Age'] = df['Age'].fillna(df['Age'].median())

    # create deck feature from 'Cabin'
    df['Deck'] = df['Cabin'].str[0].fillna('U')  # 'U' for unknown
    df['Deck_Encoded'] = df['Deck'].map({
        'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4,
        'F': 5, 'G': 6, 'T': 7, 'U': 8
    })

    # fill missing values for 'Embarked' with the mode
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    # encode 'Embarked'
    df['Embarked_Encoded'] = df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

    # drop identifier / free-text columns and the already-encoded sources
    df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Deck', 'Embarked', 'Sex'])

    # convert all columns to floats
    df = df.astype(float)

    return df


In [11]:
preprocessed_df = preprocess_data(df)

In [None]:
preprocessed_df['Deck_Encoded'].value_counts()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_Encoded,Deck_Encoded,Embarked_Encoded
0,0.0,3.0,22.0,1.0,0.0,7.2500,0.0,8.0,2.0
1,1.0,1.0,38.0,1.0,0.0,71.2833,1.0,2.0,0.0
2,1.0,3.0,26.0,0.0,0.0,7.9250,1.0,8.0,2.0
3,1.0,1.0,35.0,1.0,0.0,53.1000,1.0,2.0,2.0
4,0.0,3.0,35.0,0.0,0.0,8.0500,0.0,8.0,2.0
...,...,...,...,...,...,...,...,...,...
886,0.0,2.0,27.0,0.0,0.0,13.0000,0.0,8.0,2.0
887,1.0,1.0,19.0,0.0,0.0,30.0000,1.0,1.0,2.0
888,0.0,3.0,28.0,1.0,2.0,23.4500,1.0,8.0,2.0
889,1.0,1.0,26.0,0.0,0.0,30.0000,0.0,2.0,0.0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
def train_model(preprocessed_df: pd.DataFrame):
    """Train a random-forest classifier on ``Survived`` and log it to MLflow.

    The frame is split 80/20 (``random_state=42``) into train and test
    parts, a ``RandomForestClassifier(random_state=42)`` is fitted on the
    training part and its accuracy is measured on the test part. The model
    type, the accuracy and the fitted model are then logged to MLflow.

    If no MLflow run is active, the function opens one for the logging and
    closes it again, so repeated calls produce separate runs instead of
    piling into one implicit run that is never ended. If the caller already
    has an active run, that run is used and left open.

    Parameters
    ----------
    preprocessed_df : pd.DataFrame
        Feature frame that contains a ``Survived`` column (the target);
        all other columns are used as features.

    Returns
    -------
    tuple
        ``(clf, acc)``: the fitted classifier and its test-set accuracy.
    """
    # Split data
    X = preprocessed_df.drop('Survived', axis=1)
    y = preprocessed_df['Survived']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train model
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    # Only start (and later end) a run when the caller has not started one.
    owns_run = mlflow.active_run() is None
    if owns_run:
        mlflow.start_run()
    run_status = "FAILED"  # overwritten once all logging calls succeeded
    try:
        # Log parameters and metrics
        mlflow.log_param("model_type", "RandomForestClassifier")
        mlflow.log_metric("accuracy", acc)

        # Log model
        mlflow.sklearn.log_model(clf, "model")
        run_status = "FINISHED"
    finally:
        if owns_run:
            mlflow.end_run(status=run_status)

    return clf, acc

In [13]:
import os
import toml
# Read the AWS key pair from the local Streamlit secrets file
secrets = toml.load(".streamlit/secrets.toml")
aws_access_key_id, aws_secret_access_key = (
    secrets["AWS_ACCESS_KEY_ID"],
    secrets["AWS_SECRET_ACCESS_KEY"],
)

# Export both values as process environment variables
os.environ.update(
    AWS_ACCESS_KEY_ID=aws_access_key_id,
    AWS_SECRET_ACCESS_KEY=aws_secret_access_key,
)

In [14]:
import boto3
# Handle to the DynamoDB service in us-west-1 and to the
# 'titanic_predictions' table that the put/scan/delete cells operate on.
# NOTE(review): credentials are presumably taken from the AWS_* environment
# variables set earlier in the notebook - confirm.
dynamodb = boto3.resource('dynamodb', region_name='us-west-1')  # Update region if needed
table = dynamodb.Table('titanic_predictions')

In [16]:
test_data = pd.read_csv('data/test.csv')

In [28]:
preprocessed_df = preprocess_data(test_data)

In [29]:
preprocessed_df

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_Encoded,Deck_Encoded,Embarked_Encoded
0,3.0,34.5,0.0,0.0,7.8292,0.0,8.0,1.0
1,3.0,47.0,1.0,0.0,7.0000,1.0,8.0,2.0
2,2.0,62.0,0.0,0.0,9.6875,0.0,8.0,1.0
3,3.0,27.0,0.0,0.0,8.6625,0.0,8.0,2.0
4,3.0,22.0,1.0,1.0,12.2875,1.0,8.0,2.0
...,...,...,...,...,...,...,...,...
413,3.0,27.0,0.0,0.0,8.0500,0.0,8.0,2.0
414,1.0,39.0,0.0,0.0,108.9000,1.0,2.0,0.0
415,3.0,38.5,0.0,0.0,7.2500,0.0,8.0,2.0
416,3.0,27.0,0.0,0.0,8.0500,0.0,8.0,2.0


In [40]:
from decimal import Decimal

# Write every row of preprocessed_df to the DynamoDB table, keyed by the
# row's position. batch_writer() buffers the puts and sends them in batches
# (resending unprocessed items), instead of one request per row.
with table.batch_writer() as batch:
    for idx, (_, row) in enumerate(preprocessed_df.iterrows()):
        item = {}
        for k, v in row.to_dict().items():
            if pd.isna(v):
                item[k] = None
            elif isinstance(v, (float, np.floating)):
                # Numbers are stored as Decimal; going through str() keeps
                # the short printed value rather than the full binary
                # expansion of the float. np.floating also covers numpy
                # float types that do not subclass Python's float.
                item[k] = Decimal(str(v))
            elif isinstance(v, (np.integer, int)):
                item[k] = int(v)
            else:
                item[k] = v
        item["id"] = idx  # Use integer for Number type key
        batch.put_item(Item=item)

In [50]:
# Read the whole table. A scan returns one page per call; a response that
# contains 'LastEvaluatedKey' means another page has to be requested.
response = table.scan()
items = response['Items']
while True:
    if 'LastEvaluatedKey' not in response:
        break
    response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
    items += response['Items']

# Collect all pages in one DataFrame and preview it
df_dynamo = pd.DataFrame(items)
df_dynamo.head()  # Display the first few rows

Unnamed: 0,Fare,Deck_Encoded,Survived,Pclass,Embarked_Encoded,Sex_Encoded,id,Parch,SibSp,Age
0,7.925,8,0,3,2,0,251,0,0,20
1,8.05,8,0,3,2,0,187,0,2,17
2,31.3875,8,0,3,2,0,154,2,4,13
3,29.0,8,0,2,2,0,7,1,1,26
4,14.4542,8,0,3,0,0,115,0,1,18


In [51]:
df_dynamo.sort_values(by='id')

Unnamed: 0,Fare,Deck_Encoded,Survived,Pclass,Embarked_Encoded,Sex_Encoded,id,Parch,SibSp,Age
340,7.8292,8,0,3,1,0,0,0,0,34.5
325,7,8,0,3,2,1,1,0,1,47
142,9.6875,8,0,2,1,0,2,0,0,62
105,8.6625,8,1,3,2,0,3,0,0,27
200,12.2875,8,0,3,2,1,4,1,1,22
...,...,...,...,...,...,...,...,...,...,...
296,8.05,8,0,3,2,0,413,0,0,27
133,108.9,2,1,1,0,1,414,0,0,39
92,7.25,8,0,3,2,0,415,0,0,38.5
272,8.05,8,0,3,2,0,416,0,0,27


In [45]:
# Insert a single hand-written item.
# Decimals are built from strings, not float literals: a string gives the
# exact decimal value, while a float literal only does so when the value
# happens to be exactly representable in binary.
new_row = {
    "id": 420,  # Use a unique value for your primary key
    "Fare": Decimal("23"),
    "Deck_Encoded": 8,
    "Pclass": 2,
    "Embarked_Encoded": 1,
    "Sex_Encoded": 0,
    "Parch": 0,
    "SibSp": 0,
    "Age": Decimal("22.5"),
    "Survived": 0
}

table.put_item(Item=new_row)

{'ResponseMetadata': {'RequestId': '43SNHUA0OBPH1FKJU04IGFREPRVV4KQNSO5AEMVJF66Q9ASUAAJG',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'server': 'Server',
   'date': 'Sun, 22 Jun 2025 21:59:18 GMT',
   'content-type': 'application/x-amz-json-1.0',
   'content-length': '2',
   'connection': 'keep-alive',
   'x-amzn-requestid': '43SNHUA0OBPH1FKJU04IGFREPRVV4KQNSO5AEMVJF66Q9ASUAAJG',
   'x-amz-crc32': '2745614147'},
  'RetryAttempts': 0}}

In [49]:
# Replace 420 with the id of the row you want to delete
table.delete_item(Key={'id': 420})

{'ResponseMetadata': {'RequestId': '29CSKV2K1RLI7JPHLMRRNDG0JVVV4KQNSO5AEMVJF66Q9ASUAAJG',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'server': 'Server',
   'date': 'Sun, 22 Jun 2025 22:01:44 GMT',
   'content-type': 'application/x-amz-json-1.0',
   'content-length': '2',
   'connection': 'keep-alive',
   'x-amzn-requestid': '29CSKV2K1RLI7JPHLMRRNDG0JVVV4KQNSO5AEMVJF66Q9ASUAAJG',
   'x-amz-crc32': '2745614147'},
  'RetryAttempts': 0}}

In [33]:
# WARNING: This will delete ALL items in your table!

# Get the primary key name (e.g., 'id')
primary_key = 'id'  # Change this if your key is different

# Scan for all items
response = table.scan()
items = response['Items']

# Handle pagination
while 'LastEvaluatedKey' in response:
    response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
    items.extend(response['Items'])

# Delete each item through the batch writer, which groups the deletes into
# batched requests instead of issuing one request per item.
with table.batch_writer() as batch:
    for item in items:
        batch.delete_item(Key={primary_key: item[primary_key]})

In [35]:
# Point MLflow at the local SQLite tracking database, then load the model
# addressed by the URI "models:/titanic_model/2" through the pyfunc loader.
mlflow.set_tracking_uri('sqlite:///mlflow.db')
model = mlflow.pyfunc.load_model(model_uri="models:/titanic_model/2")

  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 13.72it/s]


In [37]:
predictions = model.predict(preprocessed_df)

In [38]:
preprocessed_df['Survived'] = predictions

In [39]:
preprocessed_df

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_Encoded,Deck_Encoded,Embarked_Encoded,Survived
0,3.0,34.5,0.0,0.0,7.8292,0.0,8.0,1.0,0
1,3.0,47.0,1.0,0.0,7.0000,1.0,8.0,2.0,0
2,2.0,62.0,0.0,0.0,9.6875,0.0,8.0,1.0,0
3,3.0,27.0,0.0,0.0,8.6625,0.0,8.0,2.0,1
4,3.0,22.0,1.0,1.0,12.2875,1.0,8.0,2.0,0
...,...,...,...,...,...,...,...,...,...
413,3.0,27.0,0.0,0.0,8.0500,0.0,8.0,2.0,0
414,1.0,39.0,0.0,0.0,108.9000,1.0,2.0,0.0,1
415,3.0,38.5,0.0,0.0,7.2500,0.0,8.0,2.0,0
416,3.0,27.0,0.0,0.0,8.0500,0.0,8.0,2.0,0


In [19]:


experiment_name = 'titanic_experiment2'
artifact_location = 's3://mlopszoomcamp-titanic-experiment/mlflow-artifacts/'

# create_experiment() fails when the name is already taken, which would
# break this cell on every run after the first. Look the experiment up
# first and only create it when it does not exist yet.
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name, artifact_location=artifact_location)
else:
    experiment_id = existing_experiment.experiment_id
experiment_id


'3'

mlflow ui --backend-store-uri sqlite:///mlflow.db

In [8]:
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('titanic_experiment')

<Experiment: artifact_location='file:///c:/Users/tman0/Documents/mlops-zoomcamp-project/mlruns/2', creation_time=1750548520758, experiment_id='2', last_update_time=1750548520758, lifecycle_stage='active', name='titanic_experiment', tags={}>

In [9]:
train_model(preprocessed_df)



In [12]:
import mlflow.pyfunc
model = mlflow.pyfunc.load_model(model_uri="models:/titanic_model/1")
