In [1]:
import pandas as pd

# Load the data from the CSV file
data = pd.read_csv('/content/sample_data/sensor.csv')

# Calculate the total number of observations
total_observations = len(data)

# Count the number of times the "machine_status" is "BROKEN"
broken_count = data[data['machine_status'] == 'BROKEN'].shape[0]

# Calculate the probability of failure
probability_of_failure = broken_count / total_observations

# Print the results
print(f'Total observations: {total_observations}')
print(f'Number of failures: {broken_count}')
print(f'Probability of failure: {probability_of_failure}')


Total observations: 130097
Number of failures: 5
Probability of failure: 3.843286163401154e-05


In [3]:
import pandas as pd

# Load the data from the CSV file
data = pd.read_csv('/content/sample_data/sensor.csv')

# Ensure the timestamp column is in datetime format
data['timestamp'] = pd.to_datetime(data['timestamp'])

# Sort the data by timestamp just in case
data = data.sort_values(by='timestamp')

# Initialize variables
recovery_times = []
in_recovery = False
recovery_start = None

# Iterate over the rows of the DataFrame
for index, row in data.iterrows():
    if row['machine_status'] == 'RECOVERING' and not in_recovery:
        # Start of a recovery period
        in_recovery = True
        recovery_start = row['timestamp']
    elif row['machine_status'] != 'RECOVERING' and in_recovery:
        # End of a recovery period
        in_recovery = False
        recovery_end = row['timestamp']
        recovery_time = (recovery_end - recovery_start).total_seconds() / 60  # Convert to minutes
        recovery_times.append(recovery_time)
        if len(recovery_times) == 7:
            break

# Print the recovery times for the first 7 failures
for i, rt in enumerate(recovery_times, 1):
    print(f'Recovery time after failure {i}: {rt} minutes')

Recovery time after failure 1: 944.0 minutes
Recovery time after failure 2: 3110.0 minutes
Recovery time after failure 3: 1312.0 minutes
Recovery time after failure 4: 605.0 minutes
Recovery time after failure 5: 8390.0 minutes
Recovery time after failure 6: 41.0 minutes
Recovery time after failure 7: 75.0 minutes


In [5]:
import pandas as pd

# Load the data from the CSV file
data = pd.read_csv('/content/sample_data/sensor.csv')

# Ensure the timestamp column is in datetime format
data['timestamp'] = pd.to_datetime(data['timestamp'])

# Sort the data by timestamp just in case
data = data.sort_values(by='timestamp')

# Calculate the total number of observations
total_observations = len(data)

# Count the number of times the "machine_status" is "BROKEN"
broken_count = data[data['machine_status'] == 'BROKEN'].shape[0]

# Calculate the probability of failure
probability_of_failure = broken_count / total_observations

# Print the overall results
print(f'Total observations: {total_observations}')
print(f'Number of failures: {broken_count}')
print(f'Probability of failure: {probability_of_failure}')

# Calculate the average probability of failures per month
data['month'] = data['timestamp'].dt.to_period('M')
monthly_failure_counts = data[data['machine_status'] == 'BROKEN'].groupby('month').size()
monthly_total_counts = data.groupby('month').size()
monthly_probabilities = monthly_failure_counts / monthly_total_counts

average_monthly_probability = monthly_probabilities.mean()

# Print the average monthly probability of failures
print(f'Average monthly probability of failures: {average_monthly_probability}')


Total observations: 220320
Number of failures: 7
Probability of failure: 3.177196804647785e-05
Average monthly probability of failures: 3.976254480286738e-05


In [9]:
import pandas as pd

# Load the data from the CSV file
data = pd.read_csv('/content/sample_data/sensor.csv')

# Ensure the timestamp column is in datetime format
data['timestamp'] = pd.to_datetime(data['timestamp'])

# Sort the data by timestamp just in case
data = data.sort_values(by='timestamp')

# Create columns for 'date' and 'month'
data['date'] = data['timestamp'].dt.date
data['month'] = data['timestamp'].dt.to_period('M')

# Group by date to determine if there was a failure on each day
daily_status = data.groupby('date')['machine_status'].apply(lambda x: 'BROKEN' in x.values).reset_index()
daily_status.columns = ['date', 'is_broken']
daily_status['month'] = pd.to_datetime(daily_status['date']).dt.to_period('M')

# Calculate the number of days with failures and without failures for each month
monthly_summary = daily_status.groupby('month')['is_broken'].value_counts().unstack(fill_value=0)
monthly_summary.columns = ['Not Broken', 'Broken']
monthly_summary['Total Days'] = monthly_summary['Not Broken'] + monthly_summary['Broken']
monthly_summary['Percentage of Failures'] = (monthly_summary['Broken'] / monthly_summary['Total Days']) * 100

# Calculate the average percentage of failures per month across all periods
average_percentage_failures = monthly_summary['Percentage of Failures'].mean()

# Print the results for each month
print(monthly_summary)

# Print the average percentage of failures per month across the entire period
print(f'Average percentage of failures per month across all periods: {average_percentage_failures:.2f}%')


         Not Broken  Broken  Total Days  Percentage of Failures
month                                                          
2018-04          28       2          30                6.666667
2018-05          29       2          31                6.451613
2018-06          29       1          30                3.333333
2018-07          29       2          31                6.451613
2018-08          31       0          31                0.000000
Average percentage of failures per month across all periods: 4.58%


In [11]:
import pandas as pd

# Load the data from the CSV file
data = pd.read_csv('/content/sample_data/sensor.csv')

# Ensure the timestamp column is in datetime format
data['timestamp'] = pd.to_datetime(data['timestamp'])

# Sort the data by timestamp just in case
data = data.sort_values(by='timestamp')

# Identify the indices where the machine_status is "BROKEN" or "RECOVERING"
disturbance_indices = data[data['machine_status'].isin(['BROKEN', 'RECOVERING'])].index

# Initialize lists to store results
results = []

# Loop through each disturbance index and find the minimum performance after the disturbance
for idx in disturbance_indices:
    # Find the next index where the machine status is neither "BROKEN" nor "RECOVERING"
    next_idx = data.loc[idx:].index[data.loc[idx:]['machine_status'].isin(['BROKEN', 'RECOVERING']) == False]

    # Ensure we have a valid next index
    if len(next_idx) == 0:
        continue
    next_idx = next_idx[0]

    # Get the minimum performance level between the disturbance index and the next non-disturbance index
    min_performance = data.loc[idx:next_idx]['sensor_06'].min()

    # Append the result
    results.append({
        'Disturbance Type': data.loc[idx, 'machine_status'],
        'Disturbance Timestamp': data.loc[idx, 'timestamp'],
        'Min Performance': min_performance
    })

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Sort the results by the disturbance timestamp
results_df = results_df.sort_values(by='Disturbance Timestamp').reset_index(drop=True)

# Display the results
print(results_df)


      Disturbance Type Disturbance Timestamp  Min Performance
0               BROKEN   2018-04-12 21:55:00         0.014468
1           RECOVERING   2018-04-12 21:56:00         0.014468
2           RECOVERING   2018-04-12 21:57:00         0.014468
3           RECOVERING   2018-04-12 21:58:00         0.014468
4           RECOVERING   2018-04-12 21:59:00         0.014468
...                ...                   ...              ...
14479       RECOVERING   2018-07-25 15:11:00        11.335360
14480       RECOVERING   2018-07-25 15:12:00        11.335360
14481       RECOVERING   2018-07-25 15:13:00        11.335360
14482       RECOVERING   2018-07-25 15:14:00        11.335360
14483       RECOVERING   2018-07-25 15:15:00        11.335360

[14484 rows x 3 columns]


In [12]:
import pandas as pd

# Load the data from the CSV file
data = pd.read_csv('/content/sample_data/sensor.csv')

# Ensure the timestamp column is in datetime format
data['timestamp'] = pd.to_datetime(data['timestamp'])

# Sort the data by timestamp just in case
data = data.sort_values(by='timestamp')

# Identify the indices where the machine_status is "BROKEN"
broken_indices = data[data['machine_status'] == 'BROKEN'].index

# Initialize a list to store the minimum performance levels after each failure
min_performance_levels = []

# Loop through each broken index and find the minimum performance after the failure
for idx in broken_indices[:7]:  # Only consider the first 7 failures
    # Find the next index where the machine status is no longer "BROKEN"
    next_idx = data.loc[idx:].index[data.loc[idx:]['machine_status'] != 'BROKEN'][0]

    # Get the minimum performance level between the broken index and the next non-broken index
    min_performance = data.loc[idx:next_idx]['sensor_06'].min()
    min_performance_levels.append(min_performance)

# Print the minimum performance levels after each of the first 7 failures
for i, performance in enumerate(min_performance_levels, 1):
    print(f'Minimum performance after failure {i}: {performance}')


Minimum performance after failure 1: 3.045428
Minimum performance after failure 2: 5.005787000000001
Minimum performance after failure 3: 13.46933
Minimum performance after failure 4: 0.02893518
Minimum performance after failure 5: 11.33536
Minimum performance after failure 6: 0.0289351847022772
Minimum performance after failure 7: 14.18547


In [14]:
import pandas as pd

# Load the data from the CSV file
data = pd.read_csv('/content/sample_data/sensor.csv')

# Ensure the timestamp column is in datetime format
data['timestamp'] = pd.to_datetime(data['timestamp'])

# Sort the data by timestamp
data = data.sort_values(by='timestamp')

# Identify indices where machine_status is "BROKEN" or "RECOVERING"
disturbance_indices = data[data['machine_status'].isin(['BROKEN', 'RECOVERING'])].index

# Initialize a list to store results
results = []
seen_performances = set()

# Loop through each disturbance index and find the minimum performance after the disturbance
for idx in disturbance_indices:
    # Find the next index where the machine status is neither "BROKEN" nor "RECOVERING"
    next_idx = data.loc[idx:].index[data.loc[idx:]['machine_status'].isin(['BROKEN', 'RECOVERING']) == False]

    # Ensure we have a valid next index
    if len(next_idx) == 0:
        continue
    next_idx = next_idx[0]

    # Get the minimum performance level between the disturbance index and the next non-disturbance index
    min_performance = data.loc[idx:next_idx]['sensor_06'].min()

    # Check if this minimum performance has been seen before
    if min_performance not in seen_performances:
        # Append the result if the minimum performance is unique
        results.append({
            'Disturbance Type': data.loc[idx, 'machine_status'],
            'Disturbance Timestamp': data.loc[idx, 'timestamp'],
            'Min Performance': min_performance
        })
        seen_performances.add(min_performance)

    # Stop if we have collected data for 7 unique failures
    if len(results) == 7:
        break

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Sort the results by the disturbance timestamp
results_df = results_df.sort_values(by='Disturbance Timestamp').reset_index(drop=True)

# Display the results
print(results_df)


  Disturbance Type Disturbance Timestamp  Min Performance
0           BROKEN   2018-04-12 21:55:00         0.014468
1       RECOVERING   2018-04-13 11:16:00         0.028935
2       RECOVERING   2018-04-13 11:48:00         6.438078
3       RECOVERING   2018-04-13 13:17:00         6.474247
4       RECOVERING   2018-04-13 13:18:00         6.524884
5       RECOVERING   2018-04-13 13:32:00         6.553819
6       RECOVERING   2018-04-13 13:38:00         6.575521
