<a href="https://colab.research.google.com/github/yxpx/google_colab/blob/main/pypal_insights_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd

# Path of the spreadsheet (or CSV) to analyse.
file_name = '/content/sample_data/pypal_data.xlsx'

try:
    # A '.csv' suffix selects the CSV reader; every other path is read as Excel.
    reader = pd.read_csv if file_name.endswith('.csv') else pd.read_excel
    df = reader(file_name)

    # Confirm the load and show a small preview of the frame.
    print(f"Successfully loaded '{file_name}'.")
    print("Here are the first 5 rows of your data:")
    print(df.head())

except FileNotFoundError:
    # Missing file: report the path that was tried.
    print(f"--- ERROR ---")
    print(f"File not found: '{file_name}'.")

except Exception as load_error:
    # Any other failure is reported rather than raised.
    print(f"An error occurred: {load_error}")

Successfully loaded '/content/sample_data/pypal_data.xlsx'.
Here are the first 5 rows of your data:
   concept_number  question_number  \
0               5                6   
1               1                1   
2               1                1   
3               1                1   
4               1                2   

                                            question  \
0  Given a dictionary containing the student name...   
1  Given an integer, return True if it is even, e...   
2  Given an integer, return True if it is even, e...   
3  Given an integer, return True if it is even, e...   
4         Given two integers, return the larger one.   

                                                code  \
0  def main(dict1):_x000D_\n\tlist1={}_x000D_\n\t...   
1  def main():_x000D_\n\tif a%2 == 0:_x000D_\n\t\...   
2                   def main():_x000D_\n\treturn a%2   
3                   def main():_x000D_\n\treturn a%2   
4           def main():_x000D_\n\ta-1_x000D_\n\tpass  

In [24]:
import pandas as pd
import numpy as np

if 'df' not in locals():
    print("--- ERROR --- 'df' is not loaded")

else:
    # Work on a string view of the feedback so .str methods apply to every row.
    feedback_str = df['exec_feedback'].astype(str)

    # True where the feedback mentions 'error' in any letter case.
    mentions_error = feedback_str.str.contains('error', case=False, na=False)

    # Rows that mention an error are labelled 'Error'; all remaining rows 'Fail'.
    df['status'] = np.where(mentions_error, 'Error', 'Fail')

    print("Here is the breakdown of all problems:")
    print(df['status'].value_counts())
    print("\n")

Here is the breakdown of all problems:
status
Fail     9141
Error    5267
Name: count, dtype: int64




In [25]:
# One whole word ending in 'Error' or 'error' (e.g. 'TypeError').
# The \b anchors together with \w* keep surrounding whitespace and neighbouring
# words out of the captured name, so each error is counted under a single label.
regex_pattern = r'\b(\w*[Ee]rror)\b'


def extract_error_type(feedback, pattern=regex_pattern):
    """Return the first error name found in each feedback entry.

    Parameters
    ----------
    feedback : pandas.Series
        Feedback text; values are converted to ``str`` before matching.
    pattern : str, optional
        Regular expression with exactly one capture group.

    Returns
    -------
    pandas.Series
        The captured text for each row, NaN where nothing matches.
    """
    return feedback.astype(str).str.extract(pattern, expand=False)


if 'df' not in locals():
    print("--- ERROR --- 'df' is not loaded.")
elif 'status' not in df.columns:
    # 'error_type' is derived from 'status', so that column must exist first.
    print("--- ERROR --- 'status' column not found.")
else:
    extracted_errors = extract_error_type(df['exec_feedback'])

    # Keep the extracted name only on rows whose status is 'Error';
    # every other row is left as NaN.
    df['error_type'] = extracted_errors.where(df['status'] == 'Error')

    print("Most common errors are:")
    print(df['error_type'].value_counts(dropna=True).head(10))

Most common errors are:
error_type
TypeError            2544
NameError            1247
ZeroDivisionError     607
UnboundLocalError     390
AttributeError        210
EOFError               88
IndexError             81
ValueError             70
KeyError               16
RecursionError          5
Name: count, dtype: int64


In [30]:
import pandas as pd

if not ('df' in locals() and 'status' in df.columns):
    print("--- ERROR --- 'df' or 'status' column not found.")

else:
    # Build an identifier of the form 'C<concept>-Q<question>'.
    # The numbers are cast to strings so they can be concatenated.
    concept_part = 'C' + df['concept_number'].astype(str)
    question_part = '-Q' + df['question_number'].astype(str)
    df['unique_question_id'] = concept_part + question_part

    # Number of rows per status value within each question.
    question_summary = df.groupby('unique_question_id')['status'].value_counts()

    # Pivot the status values into columns (0 where a combination is absent),
    # append the row total, and order from the largest total downwards.
    question_summary_table = (
        question_summary
        .unstack(fill_value=0)
        .assign(Total_Problems=lambda counts: counts.sum(axis=1))
        .sort_values(by='Total_Problems', ascending=False)
    )

    print("\n--- Summary of Problems by Question ---")
    print(question_summary_table.head(15))


--- Summary of Problems by Question ---
status              Error  Fail  Total_Problems
unique_question_id                             
C1-Q8                 622   923            1545
C1-Q7                 247   972            1219
C4-Q9                  51   659             710
C1-Q1                 558   106             664
C3-Q7                 141   523             664
C6-Q5                 126   361             487
C2-Q1                 185   250             435
C2-Q2                  88   344             432
C3-Q3                 148   279             427
C6-Q7                  94   317             411
C1-Q4                  42   366             408
C4-Q8                  48   283             331
C1-Q9                 262    66             328
C1-Q10                 39   286             325
C4-Q2                  38   250             288


In [45]:
def percent_of(part, whole):
    """Return ``part`` as a percentage of ``whole``.

    Returns 0.0 when ``whole`` is zero, so an empty frame or a frame with no
    'Error' rows produces a readable report instead of a ZeroDivisionError
    or NaN.
    """
    return (part / whole) * 100 if whole else 0.0


# Columns this report reads from df.
required_columns = ['status', 'unique_question_id', 'error_type']

if 'df' in locals() and all(col in df.columns for col in required_columns):

    # Overall split between the two status values.
    total_problems = len(df)
    status_counts = df['status'].value_counts()
    error_count = status_counts.get('Error', 0)
    fail_count = status_counts.get('Fail', 0)

    error_percentage = percent_of(error_count, total_problems)
    fail_percentage = percent_of(fail_count, total_problems)

    # Three most frequent error types and their share of all 'Error' rows.
    error_type_counts = df['error_type'].value_counts(dropna=True)
    top_3_errors = error_type_counts.head(3)
    top_3_errors_percentage = percent_of(top_3_errors.sum(), error_count)

    # Six questions with the most rows and their share of all rows.
    question_counts = df['unique_question_id'].value_counts()
    top_6_questions = question_counts.head(6)
    top_6_questions_percentage = percent_of(top_6_questions.sum(), total_problems)

    print("\n===================")
    print("      Summary")
    print("===================")

    print(f"\nAnalyzed a total of {total_problems} problem logs.")
    print(f"\n1. Breakdown of Problem Types:")
    print(f"   - Code-Crashing Errors: {error_count} cases ({error_percentage:.1f}%)")
    print(f"   - Logical Fails:        {fail_count} cases ({fail_percentage:.1f}%)")

    print("\n2. Top Code-Crashing Errors")
    print("   ----------------------------------------")
    print("   The following three error types are the most frequent:")
    for error_name, count in top_3_errors.items():
        print(f"   - {error_name}: {count} occurrences")
    print(f"\n   These three errors account for {top_3_errors_percentage:.1f}% of all code-crashing issues.")

    print("\n3. Key Insight: Top Problematic Questions")
    print("   -----------------------------------------")
    print("   The following six questions generate the most problems:")
    for q_name, count in top_6_questions.items():
        print(f"   - Question {q_name}: {count} problems")
    print(f"\n   These six questions account for {top_6_questions_percentage:.1f}% of all logged problems.")

else:
    # Report missing prerequisites instead of finishing silently.
    print("--- ERROR --- 'df' or one of the columns 'status', 'unique_question_id', 'error_type' not found.")



      Summary

Analyzed a total of 14408 problem logs.

1. Breakdown of Problem Types:
   - Code-Crashing Errors: 5267 cases (36.6%)
   - Logical Fails:        9141 cases (63.4%)

2.Top Code-Crashing Errors
   ----------------------------------------
   The following three error types are the most frequent:
   -  TypeError: 2544 occurrences
   -  NameError: 1247 occurrences
   -  ZeroDivisionError: 607 occurrences

   These three errors account for 83.5% of all code-crashing issues.

3. Key Insight: Top Problematic Questions
   -----------------------------------------
   The following six questions generate the most problems:
   - Question C1-Q8: 1545 problems
   - Question C1-Q7: 1219 problems
   - Question C4-Q9: 710 problems
   - Question C1-Q1: 664 problems
   - Question C3-Q7: 664 problems
   - Question C6-Q5: 487 problems

   These six questions account for 36.7% of all logged problems.
