In [1]:
import pandas as pd
import numpy as np
import ast, json
import os
import statsmodels.api as sm
import warnings

from imblearn.pipeline import Pipeline
from numpy import where
from statistics import *
from scipy.stats import chi2_contingency
from numpy import loadtxt
from matplotlib import pyplot
from google.colab import drive
from google.colab import files

drive.mount('/content/drive')
directory = '/content/drive/MyDrive/Research/Aqualab/Pipeline'  ###### 【Customizable item】
os.chdir(directory)

Mounted at /content/drive


In [2]:
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.display.float_format = '{:.10f}'.format

# I. Pre-process log data

- Please skip this session if all .tsv files have already converted to .csv format for regression/chi-square test
- Please make sure that all the .tsv files are in the current working directory

In [None]:
def data_process(data):
    # Filter out rows where app_branch is 'develop'
    data = data[data['app_branch'] != 'develop']
    data = data.reset_index(drop=True)

    # Sort values by user_id and timestamp
    data = data.sort_values(by=['user_id', 'timestamp'])

    # Filter out rows where event_name is 'load_error'
    data = data[data['event_name'] != 'load_error']
    return data

def extract_job_name(game_state):
    # extract job_name from 'game_state'
    try:
        game_state_dict = json.loads(game_state)
        return game_state_dict.get('job_name', '')
    except json.JSONDecodeError:
        return ''

def data_clean(data):
    data['timestamp'] = pd.to_datetime(data['timestamp'])
    data = data[data['job_name'] != 'no-active-job']
    data = data.reset_index(drop=False)
    data = data.rename(columns={'level_0': 'id_org'})
    return data

def data_chunk(data):
    # Task end when: {job differs}
    data["task_status"] = data["job_name"].shift(1,fill_value=data["job_name"].head(1)) != data["job_name"]

    # Task end when: {student changes}
    data["student_change"] = data["user_id"].shift(1, fill_value=data["user_id"].head(1)) != data["user_id"]
    data.loc[data["student_change"] == True, "task_status"] = True


    # Create task_id: whenever task status changes, task id is incremented by 1 (i.e., sum all True)
    data['task_id'] = data['task_status'].cumsum()
    data['task_id'] += 1
    data['task_id'] = data['task_id'].astype(int)

    # Drop task_ids that has only one row in the log file
    task_id_counts = data['task_id'].value_counts()
    data = data[data['task_id'].map(task_id_counts) > 1]
    if 'id_org' in data.columns:
        data = data.drop(columns=['id_org'], axis = 1)
    data = data.reset_index(drop=True)

    data = data.rename(columns={'job_name': 'job_string'})

    # Filter rows where 'event_name' is not equal to 'complete_job' for each 'task_id'
    filtered_data = data[data['event_name'] != 'complete_job']

    # Find 'task_id' where 'user_id' is the same but 'job_string' is different in the next 'task_id'
    task_ids = []
    for index, row in filtered_data.iterrows():
        current_task_id = row['task_id']
        current_user_id = row['user_id']
        current_job_string = row['job_string']

        next_row = data.iloc[index + 1] if index + 1 < len(data) else None

        if next_row is not None and next_row['user_id'] == current_user_id and next_row['job_string'] != current_job_string:
            task_ids.append(current_task_id)

    return data


def refine_data(data):
    # Identifying users to remove: those who only have 'kelp-welcome' as their job_string
    users_to_remove = data.groupby('user_id')['job_string'].apply(lambda x: all(x == 'kelp-welcome'))
    users_to_remove = users_to_remove[users_to_remove].index
    data = data[~data['user_id'].isin(users_to_remove)]

    # Remove user_id == 'default'
    data = data[data['user_id'] != 'default']

    # Adding 'task' column: counts 'complete_task' events per user per task_id
    data['task'] = data.groupby(['user_id', 'task_id'])['event_name'].transform(lambda x: x.eq('complete_task').sum())

    # Adding 'job' column: counts 'complete_job' events per user per task_id
    data['job'] = data.groupby(['user_id', 'task_id'])['event_name'].transform(lambda x: x.eq('complete_job').sum())

    # Adding 'session' column: counts 'session_id' events per user per job_string
    data['session'] = data.groupby(['user_id', 'job_string'])['session_id'].transform(lambda x: x.nunique())

    return data


def aggregate_rows(group):
    # If any event is complete_job in this task_id, then this task_id has event_change column as 'Not Swapped'
    if any(event == 'complete_job' for event in group['event_name'].values):
        group['event_change'] = 'Not Swapped'
    # If no event is complete_job in this task_id, then this task_id has event_change column as 'Swapped'
    else:
        group['event_change'] = 'Swapped'

    total_time_diff = 0

    # Loop through each session_id within the group
    for session_id, session_group in group.groupby('session_id'):
        # Calculate the time difference for the current session_id
        time_diff = session_group['timestamp'].max() - session_group['timestamp'].min()
        total_time_diff += abs(time_diff.total_seconds())  # Add the absolute value of time difference in seconds to total

    # Assign the total time difference to the group
    group['time_diff'] = total_time_diff

    # Return the first row of the group with the required columns including the calculated total time difference
    return group[['user_id', 'job_string', 'task_id', 'event_change', 'time_diff', 'session', 'task', 'job']].iloc[0]


# Function to update the last row of each user_id
def update_last_row(group):
    group.iloc[-1, group.columns.get_loc('event_change')] = 'Not Swapped'
    return group



# Takes all .tsv file in the working directory and convert to .csv
for filename in os.listdir('.'):

    if filename.endswith('.tsv'):

        data = pd.read_csv(filename, sep='\t')

        data = data_process(data)
        data['job_name'] = data['game_state'].apply(extract_job_name)
        data = data_clean(data)
        data = data_chunk(data)


        data = refine_data(data)

        data = data.groupby('task_id').apply(aggregate_rows).reset_index(drop=True)
        data = data.groupby('user_id').apply(update_last_row)
        data = data.reset_index(drop=True)

        data.to_csv(f"progression_{filename.split('.')[0]}.csv", index=False)
        files.download(f"progression_{filename.split('.')[0]}.csv")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# II. Chi-square test

- Please make sure that the .txt file for all job difficulty parameters are in the current working directory
- Level B (LevelB) is customizable
- Criteria for selecting Level As is customizable (threshold_yes, threshold_no)
- At the end of the session, the program will sort all previous levels based on the chi-square statistics from high to low. It will then retain only a customizable number, n, of previous jobs for Session III.


In [3]:
def diff_parameter(path):
    para_list = []

    with open(path, 'r') as file:
        data = file.read()

    # Split the data into jobs
    jobs = data.split('\n')

    for job in jobs:
        if job.startswith('Job:'):
            job_info = {'Job': '', 'Experimentation': 0, 'Modeling': 0, 'Argumentation': 0}
            job_info['Job'] = job.split('Job:')[1].strip()
        elif job.startswith('\tExperimentation:'):
            job_info['Experimentation'] = int(job.split('\tExperimentation:')[1].strip())
        elif job.startswith('\tModeling:'):
            job_info['Modeling'] = int(job.split('\tModeling:')[1].strip())
        elif job.startswith('\tArgumentation:'):
            job_info['Argumentation'] = int(job.split('\tArgumentation:')[1].strip())

        para_list.append(job_info)  # Append the dictionary to the list

    # Convert the list of dictionaries into a DataFrame
    para = pd.DataFrame(para_list)

    return para


def concatenate_csv_files():
    # Get all .csv files in the current working directory
    csv_files = [file for file in os.listdir('.') if file.endswith('.csv')]

    dfs = []

    for csv_file in csv_files:
        df = pd.read_csv(csv_file)
        dfs.append(df)
        concatenated_df = pd.concat(dfs, ignore_index=True)

    return concatenated_df

def process_progression_data(data):

    data['time_diff'] = pd.to_numeric(data['time_diff'], errors='coerce')
    data['job'] = pd.to_numeric(data['job'], errors='coerce')
    data['task'] = pd.to_numeric(data['task'], errors='coerce')

    data = data.drop(["task_id"], axis=1)
    data = data.drop_duplicates()

    data.reset_index(drop=True, inplace=True)

    # Replace "Not Swapped" with NaN, and "Swapped" with "switched"
    data = data.replace("Not Swapped", np.nan)
    data = data.replace("Swapped", "switched")

    # Combine 'job_string' and 'event_change' into 'job_status'
    data['job_status'] = data.apply(lambda row: str(row['job_string']) + f" ({str(row['event_change'])})" if not pd.isna(row['event_change']) else row['job_string'], axis=1)

    # Replace NaN values with "completed"
    data = data.replace(np.nan, "completed")

    # Filter out rows with 'job_string' equal to 'kelp-welcome' and reset index
    data = data[data['job_string'] != 'kelp-welcome'].reset_index(drop=True)

    # Filter out users with less than 2 entries and reset index
    data = data.groupby('user_id').filter(lambda x: len(x) >= 2).reset_index(drop=True)

    # Function to add prefix to some 'job_string'
    def add_prefix(job_string):
        if job_string in ['displaced-reef', 'turtle-danger', 'turtle-danger2']:
            return 'bayou-' + job_string
        elif job_string in ['final-final', 'above-n-below', 'completed']:
            return 'arctic-' + job_string
        else:
            return job_string

    data['job_string'] = data['job_string'].apply(add_prefix)

    return data

# Create a dictionary to count for the number of player who completed and did not complete each job
def create_job_dict(data, levelB):
    job_dict = {}
    for job in data['job_string'].unique():
        job_dict[job] = {'yes': 0, 'no': 0}


    for user, group in data.groupby('user_id'):
        # Find the minimum index for 'LevelB' job
        levelb_index = group[group['job_string'] == LevelB].index.min()

        if pd.notna(levelb_index):
          # Check each job in job_dict
          for job in job_dict:
              # Find rows before levelb_index for this user with the current job
              job_rows = group[(group.index < levelb_index) & (group['job_string'] == job)]

              # Check if any row has 'completed' event_change
              if not job_rows.empty and (job_rows['event_change'] == 'completed').any():
                  job_dict[job]['yes'] += 1
              else:
                  job_dict[job]['no'] += 1

    return job_dict

def add_values_from_para(data, para):
    argumentation_values = []
    modeling_values = []
    experimentation_values = []

    for index, row in data.iterrows():
        job_string = row['job_string']

        # Find the row in 'para' where 'Job' matches 'job_string'
        matching_row = para[para['Job'] == job_string]

        argumentation_values.append(matching_row['Argumentation'].iloc[0] if not matching_row.empty else None)
        modeling_values.append(matching_row['Modeling'].iloc[0] if not matching_row.empty else None)
        experimentation_values.append(matching_row['Experimentation'].iloc[0] if not matching_row.empty else None)

    data['Argumentation'] = argumentation_values
    data['Modeling'] = modeling_values
    data['Experimentation'] = experimentation_values

    return data

def get_level_A_packs(job_dict, threshold_yes, threshold_no):
    # Filter job A base on the threshold
    LevelApacks = []
    for job in job_dict:
        if job_dict[job]['yes'] >= threshold_yes and job_dict[job]['no'] >= threshold_no:
            LevelApacks.append(job)
    return LevelApacks


def process_macro_counts(data, LevelApacks, LevelB):
    macro_user_list_yes = []
    macro_user_list_no = []
    macro_counter_yes_switch = []
    macro_counter_no_switch = []

    for LevelA in LevelApacks:
        count_yes = 0
        user_list_yes = []

        count_no = 0
        user_list_no = []

        for user, group in data.groupby('user_id'):
            levelb_index = group[group['job_string'] == LevelB].index.min()

            if pd.notna(levelb_index):
                if (group['job_string'] == LevelA).any() and (group.index < levelb_index).any():
                    count_yes += 1
                    user_list_yes.append(user)

                if LevelA not in group.loc[group.index < levelb_index, 'job_string'].values:
                    count_no += 1
                    user_list_no.append(user)

        macro_user_list_yes.append(count_yes)
        macro_user_list_no.append(count_no)

        counter_yes_switch = 0

        for user_id in user_list_yes:
            user_group = data[(data['user_id'] == user_id) & (data['job_string'] == LevelB)]
            if 'completed' not in user_group['event_change'].values:
                counter_yes_switch += 1

        macro_counter_yes_switch.append(counter_yes_switch)

        counter_no_switch = 0

        for user_id in user_list_no:
            user_group = data[(data['user_id'] == user_id) & (data['job_string'] == LevelB)]
            if 'completed' not in user_group['event_change'].values:
                counter_no_switch += 1

        macro_counter_no_switch.append(counter_no_switch)

    return macro_user_list_yes, macro_user_list_no, macro_counter_yes_switch, macro_counter_no_switch


In [4]:
LevelB = 'coral-hunting-lions' ###### 【Customizable item】

In [14]:
data = concatenate_csv_files()
path_to_file = 'JobText.txt'
para = diff_parameter(path_to_file)
data = process_progression_data(data)
job_dict = create_job_dict(data, LevelB)
data = add_values_from_para(data, para)
LevelApacks = get_level_A_packs(job_dict, threshold_yes = 30, threshold_no = 30)  ###### 【Customizable item】
macro_user_list_yes, macro_user_list_no, macro_counter_yes_switch, macro_counter_no_switch = process_macro_counts(data, LevelApacks, LevelB)


In [23]:
def process_dataframe(LevelApacks, LevelB, macro_user_list_yes, macro_user_list_no, macro_counter_yes_switch, macro_counter_no_switch, n):

    df = pd.DataFrame(columns=['Level A', 'Level B', 'Played A before B', 'Count', '# Switched B'])

    # Populate the DataFrame
    for i, level_a in enumerate(LevelApacks):
        for j in range(2):
            level_b = LevelB
            played_before_b = 'Y' if j == 0 else 'N'
            count = macro_user_list_yes[i] if j == 0 else macro_user_list_no[i]
            switch_b = macro_counter_yes_switch[i] if j == 0 else macro_counter_no_switch[i]

            new_row = pd.DataFrame({'Level A': [level_a], 'Level B': [level_b], 'Played A before B': [played_before_b], 'Count': [count], '# Switched B': [switch_b]})
            df = pd.concat([df, new_row], ignore_index=True)

    df.reset_index(drop=True, inplace=True)

    df['% Switched B'] = df['# Switched B'] / df['Count']

    df['Difference'] = df.groupby('Level A')['% Switched B'].diff()
    df['Difference'] = df.groupby('Level A')['Difference'].transform(lambda x: x.fillna(x.mean()))

    df = df.sort_values(by=['Difference', 'Level A', 'Played A before B'], ascending=[False, True, False])

    df['% Switched B'] = df['% Switched B'] * 100
    df['% Switched B'] = df['% Switched B'].map('{:.2f}%'.format)

    df['Difference'] = df['Difference'] * 100
    df['Difference'] = df['Difference'].map('{:.2f}%'.format)
    df.insert(5, "# Not Switched B", df['Count'] - df['# Switched B'])

    df_grouped = df.groupby("Level A")[['# Switched B', '# Not Switched B']].transform('sum')
    df['Expected'] = ((df['# Switched B'] + df['# Not Switched B']) * df_grouped['# Switched B']) / (df_grouped['# Switched B'] + df_grouped['# Not Switched B'])

    unique_levels = df['Level A'].unique()
    chi_stats = []
    p_values = []

    for level in unique_levels:
        subset = df[df['Level A'] == level]
        observed = subset['# Switched B'].tolist()
        expected = subset['Expected'].tolist()

        observed_forchi2 = subset[['# Switched B', '# Not Switched B']].copy()

        # Perform chi-square test
        chi2, p, _, _ = chi2_contingency(observed_forchi2)
        chi_stats.append(chi2)
        p_values.append(p)

    result_df = pd.DataFrame({'Level A': unique_levels, 'Chi-Square Statistic': chi_stats, 'p-value': p_values})
    df = df.merge(result_df, on='Level A', how='left')


    ######### Modify the DataFrame to keep only the first n*2 rows #########
    df = df.iloc[:n*2]

    ######### Extract and return the first n "Level A" values #########
    level_a_first_n = df['Level A'].iloc[range(0, n*2, 2)].unique().tolist()
    return df, level_a_first_n

# Example of calling the function with n=3
result_dataframe, level_a_first_n = process_dataframe(LevelApacks, LevelB, macro_user_list_yes, macro_user_list_no, macro_counter_yes_switch, macro_counter_no_switch, n=3)
result_dataframe



Unnamed: 0,Level A,Level B,Played A before B,Count,# Switched B,# Not Switched B,% Switched B,Difference,Expected,Chi-Square Statistic,p-value
0,bayou-reef-decision,coral-hunting-lions,Y,124,20,104,16.13%,26.14%,44.1399176955,26.3924697082,2.786e-07
1,bayou-reef-decision,coral-hunting-lions,N,362,153,209,42.27%,26.14%,128.8600823045,26.3924697082,2.786e-07
2,bayou-hide-n-seek,coral-hunting-lions,Y,148,35,113,23.65%,18.40%,54.168,14.4142107178,0.0001466912
3,bayou-hide-n-seek,coral-hunting-lions,N,352,148,204,42.05%,18.40%,128.832,14.4142107178,0.0001466912
4,bayou-methanogen,coral-hunting-lions,Y,162,41,121,25.31%,15.97%,58.5889328063,11.4856179372,0.000701368
5,bayou-methanogen,coral-hunting-lions,N,344,142,202,41.28%,15.97%,124.4110671937,11.4856179372,0.000701368


# III. Construct regression table

- This section only consider the n number (defined in Session II) of previous jobs prior to the target job with the highest chi-square coefficient to compile the regression table.

In [25]:
def process_levels(data, LevelApacks, LevelB, num_check):
    num_levelA = 1

    for LevelA in LevelApacks:
        if num_levelA > num_check:
            break

        if LevelA == "kelp-shop-welcome":
            continue

        print("**--------------------------------", LevelA, "--------------------------------**")

        x = pd.DataFrame(columns=['user_id', 'Level', 'Complete_A', 'Complete_B',
                                  'Arg_above_2', 'Mod_above_2', 'Exp_above_2',
                                  'Time', 'Job', 'Task', 'Biome', 'Session'])

        # Filter user IDs for LevelB
        user_ids_levelb = data[data['job_string'] == LevelB]['user_id'].unique()

        # Iterate over each user_id
        for user_id in user_ids_levelb:
            user_data = data[data['user_id'] == user_id]


            #【Feature 1-2: Complete_A, Complete_B】: Calculate Complete_A and Complete_B
            complete_a = 1 if any((user_data['job_string'] == LevelA) & (user_data['event_change'] == 'completed')) else 0
            complete_b = 1 if any((user_data['job_string'] == LevelB) & (user_data['event_change'] == 'completed')) else 0

            #【Feature 3-5: unique_job_arg_count, unique_job_mod_count, unique_job_exp_count】: Count the number of jobs [completed] where each difficulty parameter > 2
            # Select rows up to and including the first 'LevelB' occurrence
            levelb_index = user_data[user_data['job_string'] == LevelB].index.min()
            user_data_levelb = user_data.loc[:levelb_index]

            filtered_df = user_data_levelb[user_data_levelb['event_change'] == 'completed']   # Which jobs has the student completed so far?
                                                                                              # Please change 'filtered_df' to 'user_data_levelb' if we want to include any job student has accepted

            unique_job_arg_count = filtered_df[filtered_df['Argumentation'] > 2]['job_string'].nunique()
            unique_job_mod_count = filtered_df[filtered_df['Modeling'] > 2]['job_string'].nunique()
            unique_job_exp_count = filtered_df[filtered_df['Experimentation'] > 2]['job_string'].nunique()

            #【Feature 6: time_diff】: Time spent so far in hours
            time_diff  = user_data['time_diff'].sum() / 3600

            #【Feature 7-8: job_sum, task_sum】: Sum Job and Task student has completed
            job_sum = user_data_levelb['job'].sum()
            task_sum = user_data_levelb['task'].sum()

            #【Feature 9: Biome】: Number of Biome Student has been to
            biome = user_data_levelb['job_string'].str.split('-').str[0].nunique()

            #【Feature 10: time_diff】: Number of time paused (unique session id) so far
            session  = user_data['session'].sum() - 1


            # Append to DataFrame x
            new_row = {'user_id': user_id, 'Level': LevelA, 'Complete_A': complete_a, 'Complete_B': complete_b,
                      'Arg_above_2': unique_job_arg_count, 'Mod_above_2': unique_job_mod_count, 'Exp_above_2': unique_job_exp_count,
                      'Time': time_diff, 'Job': job_sum, 'Task': task_sum, 'Biome': biome, 'Session': session}


            x = pd.concat([x, pd.DataFrame([new_row])], ignore_index=True)

        # Drop rows where any NaN values exist due to incomplete log file
        x = x.dropna()

        # If Biome is the same all the time, drop it
        if x['Biome'].nunique() == 1:
          x = x.drop_duplicates(subset='Biome', keep=False)

        # Selecting the independent variables (excluding 'user_id' and 'Complete_B')
        X = x.drop(['user_id', 'Complete_B', 'Level'], axis=1)

        # Adding a constant to the model (intercept)
        X = sm.add_constant(X)

        # The dependent variable
        y = x['Complete_B']

        # Fit the regression model
        model = sm.Logit(y.astype(float), X.astype(float))
        result = model.fit()

        # Print the summary of the regression
        print(result.summary())

        # Print the p-value for 'Complete_A'
        # p_value_complete_a = result.pvalues['Complete_A']
        # print(f"P-value for {LevelA}: {p_value_complete_a}")

        num_levelA += 1

LevelApacks = level_a_first_n
process_levels(data, LevelApacks, LevelB, num_check = 300)  ###### 【Customizable item】


**-------------------------------- bayou-reef-decision --------------------------------**
Optimization terminated successfully.
         Current function value: 0.540631
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:             Complete_B   No. Observations:                  397
Model:                          Logit   Df Residuals:                      387
Method:                           MLE   Df Model:                            9
Date:                Thu, 11 Apr 2024   Pseudo R-squ.:                  0.1969
Time:                        19:31:19   Log-Likelihood:                -214.63
converged:                       True   LL-Null:                       -267.27
Covariance Type:            nonrobust   LLR p-value:                 1.345e-18
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
const          -