In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
from datetime import timedelta
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots



## This is an analysis of how trainees handled solving their first AI problem.
### A few days ago I attended a training session where, as an example of applying artificial intelligence, the participants solved their first data analysis and modeling problem - the Titanic disaster. I collected the solutions they submitted (user_name, score (after solution evaluation), file, timestamp).

In [3]:
# Load the collected submissions and convert the 'timestamp' column to real
# datetimes, which the sorting and resampling cells further down rely on.
df = pd.read_csv(
    "/kaggle/input/a-story-of-some-lecture/users_titanic_scores.csv"
).assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))

## Basic exploration

In [4]:
# Headline numbers: distinct user names and total number of rows.
participant_total = df['user_name'].nunique(dropna=False)
observation_total = len(df)
print(f"Number of participants: {participant_total}")
print(f"Number of observations in the dataset: {observation_total}")

Number of participants: 21
Number of observations in the dataset: 129


### 21 participants sent a total of 129 solutions! Let's see how long the participants struggled with the RMS Titanic passenger classification problem.

In [5]:
# Earliest submission, latest submission, and the time elapsed between them.
first_sent = df['timestamp'].min()
last_sent = df['timestamp'].max()
for value in (first_sent, last_sent, last_sent - first_sent):
    print(value)

2023-07-13 11:21:17.294000
2023-07-13 12:49:02.936000
0 days 01:27:45.642000


### The first solutions were sent at 11:21 a.m., while the last solutions were sent at 12:49 p.m. Thus, the participants were committedly trying to improve their results for almost 1.5 hours.

### On the other hand, what is the number of solutions sent by individual participants? Did a small group of people send most of the solutions, or did everyone send a similar number of solutions?

In [6]:
# Bar chart of how many solutions each participant submitted
# (value_counts() orders the bars from most to fewest submissions).
user_counts = df['user_name'].value_counts()

fig = go.Figure(
    data=go.Bar(
        x=user_counts.index,
        y=user_counts.values,
        text=user_counts.values,  # print the count on each bar
        textposition='auto',
    )
)

fig.update_layout(
    title_text="Number of solutions sent by each participant",
    xaxis_title_text="Participant name",
    yaxis_title_text="Number of solutions sent",
    autosize=False,
    width=800,
    height=500,
    margin=dict(l=50, r=50, b=100, t=100, pad=4),
    paper_bgcolor="LightSteelBlue",
)

fig.show()

In [7]:
# Summarise the distribution of submissions per participant.
# The second statistic is the median (middle value), not the mode, so it is
# labelled as such; the counts are computed once and reused.
submissions_per_user = df["user_name"].value_counts()
print(f'Average number of solutions sent: {submissions_per_user.mean()}')
print(f'Median number of solutions sent: {submissions_per_user.median()}')

Average number of solutions sent: 6.142857142857143
The most common value: 5.0


### Each participant rose to the challenge of improving their score, and most did so at least several times. The average number of solutions sent was about 6 per person, and the median was 5.

### So let's see what the increase in the total sum of submitted solutions looked like over time.

In [8]:
# Order the submissions chronologically and number them 1..N so the line shows
# the running total of submissions across all participants.
df_sorted = df.sort_values('timestamp').assign(
    cumulative_count=lambda frame: np.arange(1, len(frame) + 1)
)

running_total_trace = go.Scatter(
    x=df_sorted['timestamp'],
    y=df_sorted['cumulative_count'],
    mode='lines',
    name='All Users',
)

fig = go.Figure(data=running_total_trace)
fig.update_layout(
    title='Total number of solutions sent over time',
    xaxis_title='Time',
    yaxis_title='Total number of solutions sent',
    autosize=False,
    width=800,
    height=600,
)

fig.show()



### Let's break it down into 20-minute time intervals.

In [9]:
# Count submissions in fixed 20-minute bins.
# 'min' is the supported minute alias; the old 'T' alias is deprecated in
# recent pandas releases and emits a FutureWarning.
bin_width = '20min'
df_resampled = df.resample(bin_width, on='timestamp').count()

# Start of the bin that received the most submissions.
most_active_interval = df_resampled['score'].idxmax()

fig = go.Figure(data=[
    go.Bar(name='Scores', x=df_resampled.index, y=df_resampled['score'])
])

# On a date axis plotly expects dtick in milliseconds; derive it from the bin
# width so the tick spacing always matches the bins (20 min -> 1_200_000 ms).
fig.update_xaxes(
    dtick=int(pd.Timedelta(bin_width).total_seconds() * 1000)
)
fig.update_layout(
    title='Number of solutions submitted over time, 20-minute interval',
    xaxis_title='Time',
    yaxis_title='Number of solutions sent',
    autosize=False,
    width=800,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ),
    paper_bgcolor="LightSteelBlue",
)

fig.show()

### In the first 20 minutes, almost 40 solutions were submitted. The following 20-minute interval was the busiest of the entire time the participants wrestled with the RMS Titanic passenger classification problem. In each subsequent interval, fewer solutions were submitted than in the previous one.

## Individual participants

### Let's see how the number of submitted solutions over time looked for individual trainees.

In [10]:
# Running number of submissions per participant.
# cumcount() numbers rows from 0, so add 1 to make the first submission count
# as 1. The count is taken in chronological order (not file order); the result
# is aligned back onto df by index.
df['cumulative_count'] = (
    df.sort_values('timestamp').groupby('user_name').cumcount() + 1
)

fig = go.Figure()
for user in df['user_name'].unique():
    # Sort each participant's rows by time so the line is drawn left to right.
    user_df = df[df['user_name'] == user].sort_values('timestamp')
    fig.add_trace(go.Scatter(
        x=user_df['timestamp'],
        y=user_df['cumulative_count'],
        mode='lines+markers',
        name=user
    ))

fig.update_layout(
    title='The sum of sent solutions over time',
    xaxis_title='Time',
    yaxis_title='Total solutions sent',
    autosize=False,
    width=800,
    height=600,
)
fig.show()

### Most people finished submitting their corrections a few minutes after 12 p.m. 25% of the participants struggled with the problem until at least 12:20 p.m.
### Let's visualize this data in yet another way, so that we can see how the trainees staggered their work on the solution over time.

In [11]:
# Min-max scale the score to [0, 1]; this column drives the marker size here
# and the marker colour in the score-over-time scatter plot further down.
score_min = df['score'].min()
score_span = df['score'].max() - score_min
if score_span == 0:
    # All scores identical: avoid 0/0 (NaN) and treat every score as equal.
    df['normalized_score'] = 0.0
else:
    df['normalized_score'] = (df['score'] - score_min) / score_span

# A marker of size 0 is not drawn, which would hide the lowest-scoring
# submission on a chart meant to show every submission. Add a base size so all
# points are visible while larger scores still get larger markers.
base_marker_size = 4

fig = go.Figure()

# One trace per participant: a horizontal row of markers at their submission times.
for user in df['user_name'].unique():
    user_df = df[df['user_name'] == user]
    fig.add_trace(go.Scatter(
        x=user_df['timestamp'],
        y=[user]*len(user_df),
        mode='markers',
        marker=dict(
            size=base_marker_size + user_df['normalized_score'] * 10,
        ),
        name=user
    ))


fig.update_layout(
    title='How often did users submit their solutions?',
    xaxis_title='Time',
    yaxis_title='Participants name/alias',
    autosize=False,
    width=1000,
    height=800,
)

fig.show()


### It can be seen that the strategies for solving the task over time varied. Some participants had periods of very hard work, while others improved their solution in a more systematic way.


## Did the score improve?

### The size of each point on the graph reflects the achieved score (after normalization), but the differences are so small that we need to look at the data in a different way. Let's first see how the achieved score changed over time across the submitted solutions.

In [12]:
# Every submission as a point: time on x, raw score on y, coloured by the
# normalized score computed earlier in the notebook.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=df['timestamp'], 
        y=df['score'], 
        mode='markers',
        marker=dict(
            size=6,
            color=df['normalized_score'],
            colorscale='Viridis',
            colorbar=dict(title="Normalized Score"),
            showscale=True
        ),
        text=df['user_name'],
        hoverinfo='text',  # hovering shows only the participant name
        name='Score',
    )
)

fig.update_xaxes(title_text="Time")
fig.update_yaxes(title_text="Score")
fig.update_layout(
    title="The amount of results obtained (score) of the submitted solutions over time",
    showlegend=True,
)

# Render explicitly, like every other figure cell, instead of relying on the
# implicit display of the cell's last expression.
fig.show()


### When you try to improve your solutions, you will see a much larger point range in the score obtained than in your first attempts. What emerges is a picture of learning by trial and error! : )

In [13]:
# Lowest and highest score per participant, drawn as one vertical segment each.
user_scores = (
    df.groupby('user_name')['score']
    .agg(['min', 'max'])
    .reset_index()
)

fig = go.Figure()
score_rows = zip(user_scores['user_name'], user_scores['min'], user_scores['max'])
for participant, lowest, highest in score_rows:
    # Same x value twice -> a vertical line from the minimum to the maximum.
    segment = go.Scatter(
        x=[participant, participant],
        y=[lowest, highest],
        mode='lines+markers',
        name=participant,
    )
    fig.add_trace(segment)

fig.update_layout(
    title="Range of results obtained by each user",
    xaxis_title="Name/alias of participant",
    yaxis_title="Score",
    autosize=False,
    width=800,
    height=600,
)

fig.show()

### For most users, the difference in the scores obtained was not large. For some, however, RNG(random number generator) proved less merciful. :)

### Let's see what the overall trend of the average score obtained over time looked like for the 10-minute interval.

In [14]:
# Mean score per 10-minute bin.
# 'timestamp' was already converted to datetimes when the data was loaded, so
# no second conversion is needed here.
# Sorting reassigns df: the per-user running maximum computed later in the
# notebook is order-dependent and expects chronological rows.
df = df.sort_values('timestamp')
df_time_indexed = df.set_index('timestamp')
# 'min' is the supported minute alias; the old 'T' alias is deprecated in
# recent pandas releases and emits a FutureWarning.
df_resampled = df_time_indexed['score'].resample('10min').mean()
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df_resampled.index, 
    y=df_resampled, 
    mode='lines',
    name='Tredn wyniku'
))
fig.update_layout(
    title='Average score over time, 10 minute time interval',
    xaxis_title='Time',
    yaxis_title='Score',
    autosize=False,
    width=800,
    height=600,
)

fig.show()

### You can see a definite improvement in the results obtained while struggling with the classification task.

## Fighting for a place on the leaderboard.

### Let's see how the battle for first place looked on the scoreboard.

In [15]:
# Best score reached so far by each participant, following the current row
# order of df within each user.
best_so_far = df.groupby('user_name')['score'].cummax()
df['cumulative_max_score'] = best_so_far

fig = go.Figure()

# sort=False keeps participants in order of first appearance in df.
for participant, rows in df.groupby('user_name', sort=False):
    leaderboard_trace = go.Scatter(
        x=rows['timestamp'],
        y=rows['cumulative_max_score'],
        mode='lines+markers',
        name=participant,
    )
    fig.add_trace(leaderboard_trace)

# Fixed y-axis window of 74.5 to 80.5; points outside it are not shown.
fig.update_yaxes(range=[74.5, 80.5])

fig.update_layout(
    title='The maximum score obtained by the participant during the time',
    xaxis_title='Time',
    yaxis_title='Score',
    autosize=False,
    width=1200,
    height=600,
)

fig.show()

### The top five participants competed among themselves from the very beginning of the struggle. The final order of the front runners did not change after 12 o'clock. Participants in the middle of the pack usually improved their results until 12 o'clock, while some significantly improved their scores in the final moments of the struggle.

## Thank you. It will be my pleasure to hear your feedback : )