In [None]:
import requests

# Fetch Current Quiz Data
current_quiz_data_url = "https://www.jsonkeeper.com/b/LLQT"
# A timeout keeps the cell from hanging forever on an unresponsive host, and
# raise_for_status() reports HTTP errors here instead of as a confusing
# JSON-decoding failure on an error page.
current_quiz_response = requests.get(current_quiz_data_url, timeout=30)
current_quiz_response.raise_for_status()
current_quiz_data = current_quiz_response.json()

# Fetch Historical Quiz Data
historical_quiz_data_url = "https://api.jsonserve.com/XgAgFJ"
historical_quiz_response = requests.get(historical_quiz_data_url, timeout=30)
historical_quiz_response.raise_for_status()
historical_quiz_data = historical_quiz_response.json()

# Print the decoded payloads to check the data
print("Current Quiz Data Sample:")
print(current_quiz_data)

print("Historical Quiz Data Sample:")
print(historical_quiz_data)

In [None]:
# Show the top-level keys of the current quiz payload, then the payload itself,
# each preceded by its heading.
for heading, payload in (
    ("Keys in Current Quiz Data:", current_quiz_data.keys()),
    ("Sample of Current Quiz Data:", current_quiz_data),
):
    print(heading)
    print(payload)

In [None]:
# Inspect only the leading three records of the historical quiz data
leading_records = historical_quiz_data[:3]
print("First few entries in Historical Quiz Data:", leading_records, sep="\n")

In [None]:
# Check for missing values in each record of the historical quiz data.
# Only None and empty strings/containers count as missing: a plain truthiness
# test would also flag legitimate values such as 0 or False.
def _is_missing(value):
    """Return True when `value` is None or an empty str/list/tuple/dict/set."""
    return value is None or (
        isinstance(value, (str, list, tuple, dict, set)) and len(value) == 0
    )

# One {record id: [missing field names]} entry per record that has gaps
historical_quiz_data_missing = []
for record in historical_quiz_data:
    missing_fields = [key for key, value in record.items() if _is_missing(value)]
    if missing_fields:
        historical_quiz_data_missing.append({record['id']: missing_fields})

# Print missing fields for each quiz record
print("Missing fields in Historical Quiz Data:")
print(historical_quiz_data_missing)

In [None]:
# Check for missing values in the current quiz data.
# A field is missing when it is None or an empty string/container; a bare
# truthiness test would also report valid values such as 0 or False.
current_quiz_data_missing = [
    key
    for key, value in current_quiz_data.items()
    if value is None
    or (isinstance(value, (str, list, tuple, dict, set)) and len(value) == 0)
]
print("Missing fields in Current Quiz Data:", current_quiz_data_missing)

In [None]:
import pandas as pd

# Build a DataFrame from the historical quiz records (a list of dictionaries)
historical_df = pd.DataFrame(data=historical_quiz_data)

# Show the leading rows to confirm the load looks right
historical_preview = historical_df.head()
print(historical_preview)

In [None]:
from pandas import json_normalize

# Flatten the current quiz dictionary into a DataFrame
# (nested keys become dotted column names such as 'quiz.topic').
current_quiz_df = json_normalize(data=current_quiz_data)

# Display the flattened frame under its heading
print("Current Quiz DataFrame:", current_quiz_df, sep="\n")

In [None]:
# Topic-wise performance over the historical data.
# Each 'quiz' cell is indexed with ['topic'], so it is treated as a mapping
# that carries the quiz topic; that topic is the grouping key.
def _mean_accuracy_percent(raw_accuracy):
    """Strip trailing ' ' / '%' characters, convert to numbers and average."""
    return pd.to_numeric(raw_accuracy.str.rstrip(' %')).mean()

# The key Series keeps the name 'quiz', which becomes the column name after reset_index()
quiz_topics = historical_df['quiz'].apply(lambda quiz: quiz['topic'])

topic_performance_historical = (
    historical_df
    .groupby(quiz_topics)
    .agg(
        avg_accuracy=('accuracy', _mean_accuracy_percent),
        total_correct_answers=('correct_answers', 'sum'),
        total_incorrect_answers=('incorrect_answers', 'sum'),
        total_score=('score', 'mean'),  # note: despite the name this is the mean score
    )
    .reset_index()
)

print("Topic Performance (Historical):", topic_performance_historical, sep="\n")

In [None]:
# Pull the topic and the question list out of the first row of the flattened quiz
current_topic = current_quiz_df['quiz.topic'].iat[0]
questions = current_quiz_df['quiz.questions'].iat[0]

# Count questions and those flagged 'is_correct'
# (assumes each question is a dict that may carry an 'is_correct' entry — TODO confirm)
total_questions = len(questions)
correct_answers = sum(1 for question in questions if question.get('is_correct'))
if total_questions > 0:
    accuracy = correct_answers / total_questions
else:
    accuracy = 0  # no questions: avoid dividing by zero

print(f"Current Quiz - Topic: {current_topic}, Accuracy: {accuracy:.2%}, Correct Answers: {correct_answers}/{total_questions}")

In [None]:
# Collect the truthy 'difficulty' value of every question in the current quiz
difficulty_levels = [
    level
    for level in (question.get('difficulty') for question in questions)
    if level
]

# Tally how many questions fall under each difficulty value
difficulty_counts = pd.Series(difficulty_levels).value_counts()
print("Current Quiz - Question Count by Difficulty:", difficulty_counts, sep="\n")

In [None]:
import pandas as pd

# Requires the 'initial_mistake_count' and 'better_than' columns in historical_df.

# Difficulty score = initial mistakes divided by 'better_than', row by row
historical_df['difficulty_score'] = historical_df['initial_mistake_count'].div(historical_df['better_than'])

# Map a difficulty score onto a coarse label
def assign_difficulty(score):
    """Return 'Easy' for scores <= 0.2, 'Medium' for scores <= 0.5, otherwise 'Hard'.

    Any score that fails both comparisons (including NaN) falls through to 'Hard'.
    """
    if score <= 0.2:
        return 'Easy'
    if score <= 0.5:
        return 'Medium'
    return 'Hard'

# Label every historical row with its difficulty bucket
historical_df['difficulty_level'] = historical_df['difficulty_score'].apply(assign_difficulty)

# 'accuracy' holds strings ending in '%'; convert them to 0-1 fractions
accuracy_digits = historical_df['accuracy'].str.rstrip('%')
historical_df['calculated_accuracy'] = pd.to_numeric(accuracy_digits) / 100

# Performance metrics per difficulty bucket
difficulty_performance = (
    historical_df
    .groupby('difficulty_level')
    .agg(
        avg_accuracy=('calculated_accuracy', 'mean'),
        total_correct_answers=('correct_answers', 'sum'),
        total_score=('score', 'mean'),  # note: this is the mean score
    )
    .reset_index()
)

print("Difficulty-Level Performance:", difficulty_performance, sep="\n")

In [None]:
# Keep only the numeric columns, then compute their pairwise correlations
numeric_columns = historical_df.select_dtypes(include=['number'])
correlations = numeric_columns.corr()

# Show the correlation matrix
print(correlations)

In [None]:
# Parse the submission timestamps so they can be binned by time
historical_df['submitted_at'] = pd.to_datetime(historical_df['submitted_at'])

# Weekly bins over 'submitted_at'; average accuracy and score per bin
weekly_bins = pd.Grouper(key='submitted_at', freq='W')
time_based_performance = (
    historical_df
    .groupby(weekly_bins)
    .agg(
        avg_accuracy=('calculated_accuracy', 'mean'),
        total_score=('score', 'mean'),  # note: this is the mean score
    )
    .reset_index()
)

# Show the weekly figures
print(time_based_performance)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Bar chart of historical average accuracy per topic
# (reads `topic_performance_historical`; its topic column is named 'quiz').
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=topic_performance_historical, x='quiz', y='avg_accuracy', ax=ax)
ax.set_title('Average Accuracy by Topic')
ax.set_xlabel('Topic')
ax.set_ylabel('Average Accuracy')
plt.xticks(rotation=45, ha='right')  # slanted labels stay readable
fig.tight_layout()
plt.show()

In [None]:
# Bar chart of average accuracy per difficulty level (reads `difficulty_performance`)
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=difficulty_performance, x='difficulty_level', y='avg_accuracy', ax=ax)
ax.set_title('Average Accuracy by Difficulty Level')
ax.set_xlabel('Difficulty Level')
ax.set_ylabel('Average Accuracy')
fig.tight_layout()
plt.show()

In [None]:
# Scatter of difficulty score against fractional accuracy, one point per historical row
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=historical_df, x='difficulty_score', y='calculated_accuracy', ax=ax)
ax.set_title('Correlation between Difficulty and Accuracy')
ax.set_xlabel('Difficulty Score')
ax.set_ylabel('Calculated Accuracy')
fig.tight_layout()
plt.show()

In [None]:
# Line chart of the weekly average accuracy (reads `time_based_performance`)
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=time_based_performance, x='submitted_at', y='avg_accuracy', ax=ax)
ax.set_title('Average Accuracy Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Average Accuracy')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# NOTE(review): 'your_user_id' looks like a placeholder — set a real identifier
# before running the cells that filter on `user_id`.
user_id = 'your_user_id'

In [None]:
# Select this user's submissions. The match is on 'user_id', the column the
# later per-user cells filter on; 'id' is used elsewhere in this notebook as
# the per-record identifier, so comparing it with a user id selects nothing.
# .copy() yields an independent frame that is safe to add columns to.
user_data = historical_df[historical_df['user_id'] == user_id].copy()

In [None]:
# Average accuracy (percent) of the selected user, per quiz topic
user_quiz_topics = user_data['quiz'].apply(lambda quiz: quiz['topic'])
user_topic_performance = (
    user_data
    .groupby(user_quiz_topics)
    .agg(avg_accuracy=('accuracy', lambda raw: pd.to_numeric(raw.str.rstrip(' %')).mean()))
    .reset_index()
)

# A topic is weak when the user's average is below the mean of the historical per-topic averages
historical_mean_accuracy = topic_performance_historical['avg_accuracy'].mean()
is_below_mean = user_topic_performance['avg_accuracy'] < historical_mean_accuracy
weak_topics = user_topic_performance[is_below_mean]
print("Weak Topics for the User:", weak_topics, sep="\n")

In [None]:
import requests

# Fetch the submission data
submission_url = "https://api.jsonserve.com/rJvd7g"
# Bound the wait with a timeout and fail loudly on an HTTP error status rather
# than trying to JSON-decode an error page.
submission_response = requests.get(submission_url, timeout=30)
submission_response.raise_for_status()
submission_data = submission_response.json()

# Print the decoded submission payload
print("Current Quiz Submission Data:")
print(submission_data)

In [None]:
import pandas as pd

# Split the submission into quiz metadata and the question -> selected option map
quiz_metadata = submission_data['quiz']
response_map = submission_data['response_map']

# One row per answered question, with the submission-level figures repeated on every row
response_df = pd.DataFrame(
    list(response_map.items()),
    columns=['question_id', 'selected_option'],
).assign(
    topic=quiz_metadata['topic'],
    accuracy=float(submission_data['accuracy'].strip('%')) / 100,  # percent string -> fraction
    correct_answers=submission_data['correct_answers'],
    incorrect_answers=submission_data['incorrect_answers'],
)

print("Response DataFrame:", response_df.head(), sep="\n")

In [None]:
# Historical topic performance ('avg_accuracy' is on the 0-100 percent scale)
historical_topics = (
    topic_performance_historical[['quiz', 'avg_accuracy', 'total_correct_answers']]
    .rename(columns={'quiz': 'topic'})
)

# Fail early with a clear message: without the nested questions there is no
# 'is_correct' column to aggregate below.
if 'quiz.questions' not in current_quiz_df.columns:
    raise KeyError("current_quiz_df has no 'quiz.questions' column to derive 'is_correct' from")

# Explode into a NEW frame instead of overwriting current_quiz_df, so that
# re-running this cell never explodes already-exploded rows.
current_questions_df = current_quiz_df.explode('quiz.questions')
# Non-dict cells (e.g. NaN produced by an empty question list) count as not correct.
current_questions_df['is_correct'] = current_questions_df['quiz.questions'].apply(
    lambda question: isinstance(question, dict) and bool(question.get('is_correct', False))
).astype(bool)

# Group by 'quiz.topic' to get current topic performance (a 0-1 fraction)
current_topic_performance = (
    current_questions_df
    .groupby('quiz.topic')['is_correct']
    .mean()
    .reset_index()
    .rename(columns={'quiz.topic': 'topic', 'is_correct': 'current_accuracy'})
)

# Merge historical and current topic performance
topic_comparison = historical_topics.merge(
    current_topic_performance,
    on='topic',
    how='outer',
    suffixes=('_historical', '_current')
)

# Add trend data. 'current_accuracy' is a fraction while 'avg_accuracy' is a
# percentage, so bring both onto the percent scale before subtracting; the
# trend is therefore expressed in percentage points.
topic_comparison['trend'] = topic_comparison['current_accuracy'] * 100 - topic_comparison['avg_accuracy']

# Identify weak areas (current accuracy < 70%, i.e. fraction below 0.7)
weak_topics = topic_comparison[topic_comparison['current_accuracy'] < 0.7]

print("Topic Performance Comparison:")
print(topic_comparison)
print("Weak Topics:")
print(weak_topics)

In [None]:
# Assuming we have historical_df as a DataFrame containing the historical quiz data

user_id = 'YcDFSO4ZukTJnnFMgRNVwZTE4j42'  # Replace with the actual user ID

# Filter the historical data for the specific user.
# .copy() detaches the result from historical_df: later cells add and overwrite
# columns on user_data, which on a bare boolean-filtered slice triggers
# pandas' SettingWithCopyWarning.
user_data = historical_df[historical_df['user_id'] == user_id].copy()

# Inspect the filtered data
print("User Data:")
print(user_data[['submitted_at', 'accuracy', 'speed', 'initial_mistake_count', 'mistakes_corrected']])

In [None]:
import matplotlib.pyplot as plt

# Parse the user's submission timestamps so the x-axis is a real time axis
user_data['submitted_at'] = pd.to_datetime(user_data['submitted_at'])

# Accuracy (percent strings converted to floats) against submission time
accuracy_percent = user_data['accuracy'].str.rstrip('%').astype(float)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(user_data['submitted_at'], accuracy_percent, marker='o', label='Accuracy')
ax.set_title("Accuracy Trend for User")
ax.set_xlabel("Quiz Date")
ax.set_ylabel("Accuracy (%)")
plt.xticks(rotation=45)
ax.grid(True)
ax.legend()
plt.show()

In [None]:
# The user's speed (cast to float) against submission time
speed_values = user_data['speed'].astype(float)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(user_data['submitted_at'], speed_values, marker='o', color='green', label='Speed')
ax.set_title("Speed Trend for User")
ax.set_xlabel("Quiz Date")
ax.set_ylabel("Speed (%)")
plt.xticks(rotation=45)
ax.grid(True)
ax.legend()
plt.show()

In [None]:
# Two series on one chart: initial mistakes and mistakes corrected, per submission
fig, ax = plt.subplots(figsize=(10, 6))

for column, series_label in (
    ('initial_mistake_count', 'Initial Mistakes'),
    ('mistakes_corrected', 'Mistakes Corrected'),
):
    ax.plot(user_data['submitted_at'], user_data[column], marker='o', label=series_label)

ax.set_title("Mistakes Over Time for User")
ax.set_xlabel("Quiz Date")
ax.set_ylabel("Mistakes Count")
plt.xticks(rotation=45)
ax.grid(True)
ax.legend()
plt.show()

In [None]:
# Row-over-row percentage change of accuracy, speed and mistakes corrected.
# pct_change() compares each row with the one before it, so the result depends on row order.
accuracy_percent = user_data['accuracy'].str.rstrip('%').astype(float)
speed_values = user_data['speed'].astype(float)

user_data['accuracy_change'] = accuracy_percent.pct_change() * 100
user_data['speed_change'] = speed_values.pct_change() * 100
user_data['mistakes_corrected_change'] = user_data['mistakes_corrected'].pct_change() * 100

change_columns = ['submitted_at', 'accuracy_change', 'speed_change', 'mistakes_corrected_change']
print("Improvement Metrics (Percentage Change):", user_data[change_columns], sep="\n")

## Identifying Weak Areas

In [None]:
# Cut-offs used to flag a quiz as a weak area
accuracy_threshold = 70  # accuracy (percent) must be below this
mistakes_threshold = 5   # initial mistakes must exceed this

# A quiz is weak when BOTH conditions hold
accuracy_percent = user_data['accuracy'].str.rstrip('%').astype(float)
has_low_accuracy = accuracy_percent < accuracy_threshold
has_many_mistakes = user_data['initial_mistake_count'] > mistakes_threshold
weak_areas = user_data[has_low_accuracy & has_many_mistakes]

weak_area_columns = ['submitted_at', 'accuracy', 'initial_mistake_count', 'quiz']
print("Weak Areas (Low Accuracy & High Mistakes):", weak_areas[weak_area_columns], sep="\n")

In [None]:
# Pull each quiz's topic (the 'topic' entry of the 'quiz' cell) into its own column
user_data['quiz_topic'] = user_data['quiz'].apply(lambda quiz: quiz['topic'])

def _average_percent(raw_accuracy):
    """Average of percent strings after stripping the trailing '%'."""
    return raw_accuracy.str.rstrip('%').astype(float).mean()

# Per-topic averages of accuracy (percent) and initial mistakes
topic_performance = (
    user_data
    .groupby('quiz_topic')
    .agg({'accuracy': _average_percent, 'initial_mistake_count': 'mean'})
    .reset_index()
)

# Topics whose average accuracy is under the threshold
is_weak_topic = topic_performance['accuracy'] < accuracy_threshold
consistent_weak_topics = topic_performance[is_weak_topic]

print("Consistent Weak Topics Across Quizzes (Low Accuracy):")
print(consistent_weak_topics[['quiz_topic', 'accuracy']])

In [None]:
import pandas as pd

# Accuracy (percent) below which a quiz counts as a difficulty-level struggle
difficulty_accuracy_threshold = 70

# Requires the 'initial_mistake_count' and 'better_than' columns in user_data.

# Difficulty score = initial mistakes divided by 'better_than', row by row
user_data['difficulty_score'] = user_data['initial_mistake_count'].div(user_data['better_than'])

# Translate a difficulty score into one of three labels
def assign_difficulty(score):
    """Label a score: 'Easy' (<= 0.2), 'Medium' (<= 0.5) or 'Hard' (everything else, NaN included)."""
    return 'Easy' if score <= 0.2 else ('Medium' if score <= 0.5 else 'Hard')

user_data['difficulty_level'] = user_data['difficulty_score'].apply(assign_difficulty)

# Filter for quizzes where difficulty level is 'Hard' and accuracy is low.
# The label must match assign_difficulty()'s output exactly: it returns the
# capitalised 'Hard', so comparing against lowercase 'hard' never matches.
difficulty_struggles = user_data[
    (user_data['difficulty_level'] == 'Hard') &
    (user_data['accuracy'].str.rstrip('%').astype(float) < difficulty_accuracy_threshold)
]

print("Difficulty-Level Struggles (Hard Quizzes with Low Accuracy):")
print(difficulty_struggles[['submitted_at', 'accuracy', 'difficulty_level', 'quiz']])

In [None]:
# Row-over-row percentage change for accuracy, mistakes corrected and speed
accuracy_percent = user_data['accuracy'].str.rstrip('%').astype(float)
user_data['accuracy_change'] = accuracy_percent.pct_change() * 100
user_data['mistakes_corrected_change'] = user_data['mistakes_corrected'].pct_change() * 100
user_data['speed_change'] = user_data['speed'].astype(float).pct_change() * 100

# A row shows a negative trend when ANY of the three changes is below zero
accuracy_dropped = user_data['accuracy_change'] < 0
corrections_dropped = user_data['mistakes_corrected_change'] < 0
speed_dropped = user_data['speed_change'] < 0
negative_trends = user_data[accuracy_dropped | corrections_dropped | speed_dropped]

trend_columns = ['submitted_at', 'accuracy_change', 'mistakes_corrected_change', 'speed_change', 'quiz']
print("Negative Performance Trends:", negative_trends[trend_columns], sep="\n")

In [None]:
# Build one list of recommendation sentences from the four analyses above,
# in this order: weak areas, difficulty struggles, consistent weak topics, negative trends.
recommendations = []

# Weak areas: low accuracy together with many initial mistakes
recommendations.extend(
    f"Focus on improving accuracy in the topic '{row['quiz']['topic']}' where accuracy was {row['accuracy']} and initial mistakes were {row['initial_mistake_count']}."
    for _, row in weak_areas.iterrows()
)

# Difficulty struggles
recommendations.extend(
    f"Practice more questions of '{row['difficulty_level']}' difficulty, especially in the topic '{row['quiz']['topic']}', where accuracy was low."
    for _, row in difficulty_struggles.iterrows()
)

# Consistent weak topics
recommendations.extend(
    f"Revise the topic '{row['quiz_topic']}' as your average accuracy in this topic is {row['accuracy']:.2f}%, which is below the expected threshold."
    for _, row in consistent_weak_topics.iterrows()
)

# Negative trends: name every metric that declined on that row
declining_metrics = (
    ('accuracy_change', 'accuracy'),
    ('mistakes_corrected_change', 'mistake correction'),
    ('speed_change', 'speed'),
)
for _, row in negative_trends.iterrows():
    topic = row['quiz']['topic']
    declined = "".join(
        f"{label} in '{topic}' "
        for column, label in declining_metrics
        if row[column] < 0
    )
    recommendations.append("Revisit topics where performance has declined, specifically: " + declined)

# Drop repeated sentences while keeping first-seen order
recommendations = list(dict.fromkeys(recommendations))

# Print the numbered list
print("Generated Recommendations:")
for position, text in enumerate(recommendations, start=1):
    print(f"{position}. {text}")

**VISUALISING ACCURACY TRENDS**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Convert 'submitted_at' to datetime if it's not already
if not pd.api.types.is_datetime64_any_dtype(user_data['submitted_at']):
    user_data['submitted_at'] = pd.to_datetime(user_data['submitted_at'])

# 'accuracy' holds percentage strings (the other cells strip a trailing '%'
# before using it). Passed as-is, the y-axis would be built from text labels
# instead of numbers, so plot a numeric copy.
accuracy_percent = pd.to_numeric(user_data['accuracy'].str.rstrip(' %'))

# Plot accuracy trend over time
plt.figure(figsize=(12, 6))
sns.lineplot(x=user_data['submitted_at'], y=accuracy_percent, marker='o', color='blue')
plt.title("User's Accuracy Trend Over Time")
plt.xlabel("Quiz Date")
plt.ylabel("Accuracy (%)")
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.grid(True)
plt.tight_layout()  # Adjust layout to prevent labels from overlapping
plt.show()

## Generating Recommendations

In [None]:
def generate_recommendations(user_data, topic_performance_historical, accuracy_threshold=70, mistakes_threshold=5):
    """
    Generates personalized recommendations for the user based on quiz performance.

    Four checks run in order and each contributes sentences to the result:
    weak areas, consistently weak topics, 'Hard' quizzes with low accuracy,
    and negative row-over-row trends.

    Args:
        user_data (pd.DataFrame): The user's quiz rows. Reads the columns
            'accuracy' (percentage strings), 'initial_mistake_count',
            'better_than', 'mistakes_corrected', 'speed' and 'quiz' (a mapping
            with a 'topic' entry). The frame passed in is not modified.
        topic_performance_historical (pd.DataFrame): Not used by the current
            implementation; kept so existing callers keep working.
        accuracy_threshold (int, optional): Accuracy (percent) below which a quiz or topic is considered weak. Defaults to 70.
        mistakes_threshold (int, optional): Initial-mistake count above which a quiz counts as high-mistake. Defaults to 5.

    Returns:
        list: Unique recommendation strings, in the order they were first produced.
    """
    # Work on a private copy so the helper columns added below never leak into
    # the caller's DataFrame (and never write into a filtered slice).
    user_data = user_data.copy()

    # Parse the percentage strings once; every check below reuses the result.
    accuracy_percent = user_data['accuracy'].str.rstrip(' %').astype(float)
    low_accuracy = accuracy_percent < accuracy_threshold

    recommendations = []

    # 1. Weak Areas (Low Accuracy & High Mistakes)
    weak_areas = user_data[low_accuracy & (user_data['initial_mistake_count'] > mistakes_threshold)]
    for _, row in weak_areas.iterrows():
        recommendations.append(f"Focus on improving accuracy in the topic '{row['quiz']['topic']}' where accuracy was {row['accuracy']} and initial mistakes were {row['initial_mistake_count']}.")

    # 2. Consistent Weak Topics (Low Accuracy Across Quizzes)
    user_data['quiz_topic'] = user_data['quiz'].apply(lambda quiz: quiz['topic'])
    topic_performance = user_data.groupby('quiz_topic').agg({
        'accuracy': lambda raw: raw.str.rstrip(' %').astype(float).mean(),
        'initial_mistake_count': 'mean'
    }).reset_index()
    consistent_weak_topics = topic_performance[topic_performance['accuracy'] < accuracy_threshold]
    for _, row in consistent_weak_topics.iterrows():
        recommendations.append(f"Revise the topic '{row['quiz_topic']}' as your average accuracy in this topic is {row['accuracy']:.2f}%, which is below the expected threshold.")

    # 3. Difficulty-Level Struggles (Hard Quizzes with Low Accuracy)
    user_data['difficulty_score'] = user_data['initial_mistake_count'] / user_data['better_than']
    user_data['difficulty_level'] = user_data['difficulty_score'].apply(
        lambda score: 'Easy' if score <= 0.2 else 'Medium' if score <= 0.5 else 'Hard'
    )
    difficulty_struggles = user_data[(user_data['difficulty_level'] == 'Hard') & low_accuracy]
    for _, row in difficulty_struggles.iterrows():
        recommendations.append(f"Practice more questions of '{row['difficulty_level']}' difficulty, especially in the topic '{row['quiz']['topic']}', where accuracy was low.")

    # 4. Negative Performance Trends (Declining Accuracy, Speed, or Mistakes Corrected)
    # pct_change() compares each row with the previous one, so this assumes the
    # rows are already in chronological order — TODO confirm with the data source.
    user_data['accuracy_change'] = accuracy_percent.pct_change() * 100
    user_data['mistakes_corrected_change'] = user_data['mistakes_corrected'].pct_change() * 100
    user_data['speed_change'] = user_data['speed'].astype(float).pct_change() * 100
    negative_trends = user_data[
        (user_data['accuracy_change'] < 0) |
        (user_data['mistakes_corrected_change'] < 0) |
        (user_data['speed_change'] < 0)
    ]
    for _, row in negative_trends.iterrows():
        recommendation = "Revisit topics where performance has declined, specifically: "
        if row['accuracy_change'] < 0:
            recommendation += f"accuracy in '{row['quiz']['topic']}' "
        if row['mistakes_corrected_change'] < 0:
            recommendation += f"mistake correction in '{row['quiz']['topic']}' "
        if row['speed_change'] < 0:
            recommendation += f"speed in '{row['quiz']['topic']}' "
        recommendations.append(recommendation)

    # Remove duplicate recommendations (dict keys keep first-seen order)
    recommendations = list(dict.fromkeys(recommendations))

    return recommendations

# Example usage: build the list for the current user and print it numbered from 1
recommendations = generate_recommendations(user_data, topic_performance_historical)
print("Generated Recommendations:")
for i, recommendation in enumerate(recommendations, start=1):
    print(f"{i}. {recommendation}")

In [None]:
import requests
import pandas as pd
import numpy as np

# Fetch the data
historical_quiz_data_url = "https://api.jsonserve.com/XgAgFJ"
submission_data_url = "https://api.jsonserve.com/rJvd7g"
current_quiz_data_url = "https://www.jsonkeeper.com/b/LLQT"

def _fetch_json(url, timeout=30):
    """GET `url` and return its decoded JSON body.

    The timeout bounds the wait on an unresponsive host, and raise_for_status()
    turns an HTTP error status into an exception instead of letting an error
    page reach the JSON decoder.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()

historical_quiz_data = _fetch_json(historical_quiz_data_url)
submission_data = _fetch_json(submission_data_url)
current_quiz_data = _fetch_json(current_quiz_data_url)

# Convert historical quiz data to DataFrame
historical_df = pd.DataFrame(historical_quiz_data)

# Get all unique user IDs
all_user_ids = historical_df['user_id'].unique()

# Select the quizzes that combine low accuracy with many initial mistakes
def identify_weak_areas(user_data, accuracy_threshold=70, mistakes_threshold=5):
    """Return the rows of `user_data` with accuracy below `accuracy_threshold`
    (percent) AND 'initial_mistake_count' above `mistakes_threshold`.

    'accuracy' is read as percentage strings; a trailing '%' is stripped before
    the numeric comparison. The input frame is not modified.
    """
    accuracy_percent = user_data['accuracy'].str.rstrip('%').astype(float)
    is_inaccurate = accuracy_percent < accuracy_threshold
    is_mistake_heavy = user_data['initial_mistake_count'] > mistakes_threshold
    return user_data[is_inaccurate & is_mistake_heavy]

# Function to identify consistent weak topics
def identify_consistent_weak_topics(user_data, accuracy_threshold=70):
    """Return per-topic averages for the topics whose mean accuracy is low.

    Args:
        user_data (pd.DataFrame): Reads 'quiz' (a mapping; its 'topic' entry is
            used, 'Unknown' when absent), 'accuracy' (percentage strings) and
            'initial_mistake_count'. The frame passed in is not modified.
        accuracy_threshold (int, optional): Mean accuracy (percent) below which
            a topic is reported. Defaults to 70.

    Returns:
        pd.DataFrame: Columns 'quiz_topic', 'accuracy' (mean percent) and
        'initial_mistake_count' (mean), one row per weak topic.
    """
    # Build the working columns on a private frame instead of adding
    # 'quiz_topic' to the caller's DataFrame (which may be a filtered slice).
    per_quiz = user_data[['accuracy', 'initial_mistake_count']].copy()
    per_quiz['accuracy'] = per_quiz['accuracy'].str.rstrip(' %').astype(float)
    per_quiz['quiz_topic'] = user_data['quiz'].apply(lambda quiz: quiz.get('topic', 'Unknown'))  # Handle missing 'topic'

    topic_performance = per_quiz.groupby('quiz_topic').agg({
        'accuracy': 'mean',
        'initial_mistake_count': 'mean'
    }).reset_index()
    consistent_weak_topics = topic_performance[topic_performance['accuracy'] < accuracy_threshold]
    return consistent_weak_topics

# Function to identify difficulty-level struggles
def identify_difficulty_struggles(user_data, accuracy_threshold=70):
    """Return the 'Hard' quizzes on which accuracy was below the threshold.

    A difficulty score is computed as 'initial_mistake_count' / 'better_than'
    and labelled 'Easy' (<= 0.2), 'Medium' (<= 0.5) or 'Hard' (anything else,
    which includes NaN and infinite scores).

    Args:
        user_data (pd.DataFrame): Reads 'initial_mistake_count', 'better_than'
            and 'accuracy' (percentage strings). The frame passed in is not
            modified.
        accuracy_threshold (int, optional): Accuracy (percent) below which a
            'Hard' quiz is reported. Defaults to 70.

    Returns:
        pd.DataFrame: The matching rows, with the extra columns
        'difficulty_score' and 'difficulty_level'.
    """
    # Add the helper columns to a copy so the caller's frame stays untouched.
    scored = user_data.copy()
    scored['difficulty_score'] = scored['initial_mistake_count'] / scored['better_than']
    scored['difficulty_level'] = scored['difficulty_score'].apply(
        lambda score: 'Easy' if score <= 0.2 else 'Medium' if score <= 0.5 else 'Hard'
    )
    accuracy_percent = scored['accuracy'].str.rstrip(' %').astype(float)
    difficulty_struggles = scored[
        (scored['difficulty_level'] == 'Hard') &
        (accuracy_percent < accuracy_threshold)
    ]
    return difficulty_struggles

# Function to identify negative performance trends
def identify_negative_trends(user_data):
    """Return the rows where accuracy, mistakes corrected or speed fell versus the previous row.

    Each change is the row-over-row percentage change (pct_change() * 100), so
    the result depends on row order; this presumably expects chronological
    rows — TODO confirm. Infinite values are replaced with NaN, and NaN changes
    never count as negative.

    Args:
        user_data (pd.DataFrame): Reads 'accuracy' (percentage strings),
            'mistakes_corrected' and 'speed'. The frame passed in is not
            modified.

    Returns:
        pd.DataFrame: The declining rows, with the extra columns
        'accuracy_change', 'mistakes_corrected_change' and 'speed_change'.
    """
    # Copy first: the original added columns to, and ran an in-place replace
    # on, the caller's frame (typically a filtered slice of a larger one).
    trends = user_data.copy()
    trends['accuracy_change'] = trends['accuracy'].str.rstrip(' %').astype(float).pct_change() * 100
    trends['mistakes_corrected_change'] = trends['mistakes_corrected'].pct_change() * 100
    trends['speed_change'] = trends['speed'].astype(float).pct_change() * 100
    # Replace infinite values (a change from a zero baseline) with NaN for easier filtering
    trends = trends.replace([np.inf, -np.inf], np.nan)

    negative_trends = trends[
        (trends['accuracy_change'].fillna(0) < 0) |  # NaN is treated as "no change"
        (trends['mistakes_corrected_change'].fillna(0) < 0) |
        (trends['speed_change'].fillna(0) < 0)
    ]
    return negative_trends


# Function to generate user persona
def generate_user_persona(user_id, historical_df, submission_data, current_quiz_data):
    """Generates a user persona based on the provided data.

    Args:
        user_id: Value matched against historical_df['user_id'].
        historical_df (pd.DataFrame): All users' quiz rows. Reads 'user_id',
            'accuracy' (percentage strings), 'speed' and 'quiz' (a mapping with
            a 'topic' entry), plus whatever the identify_* helpers read. The
            frame is not modified.
        submission_data: Not used by the current implementation; kept so
            existing callers keep working.
        current_quiz_data: Not used by the current implementation; kept so
            existing callers keep working.

    Returns:
        dict: Keys 'user_id', 'average_accuracy', 'preferred_topics' (topics
        ordered by how often the user took them), 'learning_style' ("Fast" or
        "Moderate"), 'strengths' and 'weaknesses' (lists of sentences).
    """
    # Independent copy of this user's rows, so nothing downstream can write
    # into a slice of historical_df.
    user_data = historical_df[historical_df['user_id'] == user_id].copy()

    # Calculate average accuracy and speed
    avg_accuracy = user_data['accuracy'].str.rstrip(' %').astype(float).mean()
    avg_speed = user_data['speed'].astype(float).mean()

    # Identify preferred topics: most frequently taken first
    preferred_topics = (
        user_data['quiz']
        .apply(lambda quiz: quiz.get('topic', 'Unknown'))  # Handle missing 'topic'
        .value_counts()
        .index
        .tolist()
    )

    # Basic learning style: "Fast" means the user's mean speed beats the 75th
    # percentile of ALL submissions. Comparing against the user's own 75th
    # percentile (as before) measures the skew of their own data, not how they
    # rank against others.
    fast_cutoff = historical_df['speed'].astype(float).quantile(0.75)
    is_fast = avg_speed > fast_cutoff
    learning_style = "Fast" if is_fast else "Moderate"

    # Analyze strengths
    strengths = []
    if avg_accuracy > 80:
        strengths.append("High Overall Accuracy")
    if is_fast:
        strengths.append("Fast Quiz Completion")

    # Call weakness identification functions
    weak_areas = identify_weak_areas(user_data)
    consistent_weak_topics = identify_consistent_weak_topics(user_data)
    difficulty_struggles = identify_difficulty_struggles(user_data)
    negative_trends = identify_negative_trends(user_data)

    # Combine weaknesses into a list
    weaknesses = []
    weaknesses.extend([
        f"Struggles with accuracy in '{row['quiz'].get('topic', 'Unknown')}' (accuracy: {row['accuracy']}, initial mistakes: {row['initial_mistake_count']})"
        for _, row in weak_areas.iterrows()
    ])
    weaknesses.extend([
        f"Consistently underperforms in '{row['quiz_topic']}' (average accuracy: {row['accuracy']:.2f}%)"
        for _, row in consistent_weak_topics.iterrows()
    ])
    weaknesses.extend([
        f"Faces challenges with 'Hard' difficulty questions, especially in '{row['quiz'].get('topic', 'Unknown')}'"
        for _, row in difficulty_struggles.iterrows()
    ])
    # One sentence per declining row, naming each metric whose '<metric>_change' is negative
    weaknesses.extend([
        "Shows declining performance in: " +
        ", ".join([f"{metric} in '{row['quiz'].get('topic', 'Unknown')}'"
                   for metric in ['accuracy', 'mistakes corrected', 'speed']
                   if row[f"{metric.replace(' ', '_')}_change"] < 0])
        for _, row in negative_trends.iterrows()
    ])

    persona = {
      'user_id': user_id,
      'average_accuracy': avg_accuracy,
      'preferred_topics': preferred_topics,
      'learning_style': learning_style,
      'strengths': strengths,
      'weaknesses': weaknesses
    }

    return persona

# Build one persona per distinct user id, keyed by that id
user_personas = {
    uid: generate_user_persona(uid, historical_df, submission_data, current_quiz_data)
    for uid in all_user_ids
}

# Print every persona followed by a separator line
separator = "-" * 20
for user_id, persona in user_personas.items():
    print(f"User Persona for {user_id}:")
    print(persona)
    print(separator)