# RTB Determination Analysis

In [None]:
import os
import pandas as pd
from dateutil import parser
import matplotlib.pyplot as plt

In [None]:
# Directory containing the CSV files to combine.
# The path is relative, so it is resolved against the current working directory.
folder_path = "../data/summary"

In [None]:
# Collect the path of every CSV file in the folder. The names are sorted
# because os.listdir() returns entries in arbitrary order, which would make
# the row order of the combined file differ between runs and machines.
csv_paths = [
    os.path.join(folder_path, filename)
    for filename in sorted(os.listdir(folder_path))
    if filename.endswith(".csv")
]

# Fail with an explicit message rather than pd.concat's generic
# "No objects to concatenate" when the folder holds no CSV files.
if not csv_paths:
    raise ValueError(f"No CSV files found in {folder_path!r}")

# Read each CSV file into its own DataFrame
dfs = [pd.read_csv(file_path) for file_path in csv_paths]

# Concatenate all DataFrames, renumbering the index from 0
combined_df = pd.concat(dfs, ignore_index=True)

# Save the combined DataFrame to a new CSV file (without the index column)
combined_df.to_csv("../data/case_metadata_combined.csv", index=False)

In [None]:
# Preview the first rows of the combined DataFrame
combined_df.head()

In [None]:
# Print a summary of the columns: non-null counts and dtypes
combined_df.info()

In [None]:
dr_column = "DR No."

# Drop every row that has no value in the "DR No." column
combined_df = combined_df.dropna(subset=[dr_column])

# Flag all rows whose "DR No." value occurs more than once
# (keep=False marks every occurrence, not just the repeats)
is_repeated_dr = combined_df.duplicated(subset=dr_column, keep=False)
duplicate_dr_df = combined_df[is_repeated_dr]

# Display the rows with a duplicated "DR No."
duplicate_dr_df

In [None]:
tr_column = "TR No."

# Drop every row that has no value in the "TR No." column
combined_df = combined_df.dropna(subset=[tr_column])

# Flag all rows whose "TR No." value occurs more than once
# (keep=False marks every occurrence, not just the repeats)
is_repeated_tr = combined_df.duplicated(subset=tr_column, keep=False)
duplicate_tr_df = combined_df[is_repeated_tr]

# Display the rows with a duplicated "TR No."
duplicate_tr_df

In [None]:
# Normalise "Upload Date" to day/month/year text using dateutil.parser.
# Missing values are passed through unchanged: parser.parse() only accepts
# strings, so a NaN would otherwise abort the whole cell with a TypeError.
combined_df["Upload Date"] = combined_df["Upload Date"].apply(
    lambda x: parser.parse(x).strftime("%d/%m/%Y") if pd.notna(x) else x
)
combined_df.info()

In [None]:
# Tally populated and missing cells for each column
non_null_counts = combined_df.notna().sum()
null_counts = combined_df.isna().sum()

# Stacked bar chart: populated counts at the base, missing counts on top
fig, ax = plt.subplots(figsize=(10, 6))
non_null_counts.plot.bar(ax=ax, color="skyblue", label="Populated")
null_counts.plot.bar(ax=ax, bottom=non_null_counts, color="orange", label="Null")
ax.set_title("Populated vs Null Values for Each Column")
ax.set_xlabel("Columns")
ax.set_ylabel("Count")
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Preview the first rows again at this point in the analysis
combined_df.head()

In [None]:
# Per-column tallies: non-null entries and distinct values
total_counts = combined_df.count()
unique_counts = combined_df.nunique()

# Place the two Series side by side as the columns of one DataFrame
counts_df = pd.concat(
    [total_counts, unique_counts],
    axis=1,
    keys=["Total Values", "Unique Values"],
)

print("Count of unique and total values for each column:")
print(counts_df)

In [None]:
# Grouped bar chart comparing total and unique value counts per column
colors = ["skyblue", "orange"]
counts_df.plot(kind="bar", figsize=(10, 6), color=colors)
plt.title("Count of Total and Unique Values for Each Column")
plt.xlabel("Columns")
plt.ylabel("Count")
# Right-align the rotated labels so each one ends under its own bar group
plt.xticks(rotation=45, ha="right")
plt.legend(title="Counts", loc="upper right")
# Keep long, rotated column names from being clipped at the figure edge
plt.tight_layout()
plt.show()

In [None]:
# Count the number of uploads for each date. "Upload Date" holds
# day/month/year text, so grouping on the raw strings would order the x-axis
# lexicographically (e.g. "01/02/2024" before "15/01/2024"). Parsing to
# datetimes first makes the series chronological; missing dates become NaT
# and are left out of the groups.
upload_dates = pd.to_datetime(combined_df["Upload Date"], format="%d/%m/%Y")
uploads_over_time = combined_df.groupby(upload_dates).size()

# Plotting
plt.figure(figsize=(10, 6))
uploads_over_time.plot()
plt.title("Uploads Over Time")
plt.xlabel("Date")
plt.ylabel("Number of Uploads")
plt.grid(True)
plt.show()

In [None]:
# Tally how often each distinct value appears in the "Subject" column
subject_column = combined_df["Subject"]
subject_counts = subject_column.value_counts()

# Show the heading followed by the tallies on the next line
print("Frequency counts for the Subject column:", subject_counts, sep="\n")


In [None]:
# Select the ten most frequent subjects (subject_counts is already ordered
# from most to least frequent by value_counts()).
top_ten_subjects = subject_counts.head(10)
# The old, misleading name is retained so any cell still using it keeps working.
top_twenty_subjects = top_ten_subjects

# Plotting
plt.figure(figsize=(10, 6))
top_ten_subjects.plot(kind="bar")
plt.title("Top Ten Subjects")
plt.xlabel("Subjects")
plt.ylabel("Frequency")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()
