# Scoring


In [71]:
import pandas as pd
import re
# Load the scored results and discard the two 'Unnamed' columns
# (presumably index columns written by earlier saves -- TODO confirm).
df = (
    pd.read_csv("result_df_scores.csv")
    .drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
)

In [72]:
# Inspect the loaded frame after the two 'Unnamed' columns were dropped.
df

Unnamed: 0,expected_output,actual_output,scores
0,"\n\nA: I cut until this length ok?\nB: Yeah, o...","\n\n\nA: I cut until this length ok?\nB: Yeah,...",0.0
1,\nA: Bank robbery leh51,"\n\nA: Bank robbery leh33, and you dare to tac...",0.0
2,\n\nA: So where are we meeting later?\nB: we s...,\n\n\nA: So where are we meeting later?\nB: we...,1.0
3,\n\nA: Did you enjoy your weekend?\nB: I had t...,\n\n\nA: Did you enjoy your weekend?\nB: I had...,1.0
4,"\n\nA: Hey, where are you right now?\nB: I'm a...","\n\n\nA: Hey, where are you right now?\nB: I'm...",1.0
5,\n\nA: What are you shopping for?\nB: Watch lo...,\n\n\nA: What are you shopping for?\nB: Watch ...,0.5
6,"\n\nA: Huh, are you still doing yesterday's tu...","\n\n\nA: Huh, are you still doing yesterday's ...",0.5
7,"\n\nA: Hey, did you tell Shuhui about the surp...","\n\n\nA: Hey, did you tell Shuhui about the su...",0.5
8,"\n\nA: Hey, have you been to Sarah's house? \n...","\n\n\nA: Hey, have you been to Sarah's house? ...",0.5
9,"\n\nA: Hey, are you coming to the party tonigh...","\n\n\nA: Hey, are you coming to the party toni...",0.0


In [73]:
def extract_particles(text):
  """Return every particle token found in ``text``, in order of appearance.

  A particle token is ``lah``, ``lor`` or ``leh`` immediately followed by
  exactly two digits (e.g. ``lah21``), standing as a whole word.

  Args:
    text: The string to scan.

  Returns:
    A list of the matched tokens; duplicates are kept, and the list is empty
    when nothing matches.
  """
  LIST_OF_PARTICLES = [r'lah\d{2}', r'lor\d{2}', r'leh\d{2}']
  # Wrap the alternation in a non-capturing group so that BOTH word boundaries
  # apply to EVERY alternative. Without the group, `|` binds looser than `\b`,
  # so only `lah..` gets the leading boundary and only `leh..` the trailing one
  # (e.g. "color33" would yield "lor33" and "lah215" would yield "lah21").
  pattern = r"\b(?:" + "|".join(LIST_OF_PARTICLES) + r")\b"
  return re.findall(pattern, text)

# TEST: smoke-check on a sentence that contains lah, leh and lor tokens,
# including a repeated one; the returned list is shown as this cell's output.
extract_particles("walao lah21 leh33, you lah21, ofc you lor33")

['lah21', 'leh33', 'lah21', 'lor33']

In [74]:
# Particles present in each expected output, as a list per row.
df['ground_truth'] = df['expected_output'].apply(extract_particles)

In [75]:
# Number of particles found in each expected output.
df['ground_truth_num_particles'] = df['ground_truth'].apply(len)

In [76]:
# Particles present in each actual output, as a list per row.
df['actual_output_particles'] = df['actual_output'].apply(extract_particles)

In [77]:
# Persist the frame, now including the extracted-particle columns, to Excel.
# `index` is left at its default, so the row index is written as a column too.
df.to_excel('result_df_scores.xlsx')

### Dataset Expected Output EDA

In [78]:
# How many rows have 1, 2, 3, ... particles in their expected output.
distribution_table = (
    df['ground_truth_num_particles']
    .value_counts()
    .reset_index()
)
distribution_table

Unnamed: 0,ground_truth_num_particles,count
0,1,36
1,2,20
2,3,3
3,4,1


In [79]:
from collections import Counter
def get_particle_counts(colname, df):
    """Tally how often each element occurs across a column of lists.

    Args:
        colname: Name of a column in ``df`` whose cells are lists
            (e.g. the lists returned by ``extract_particles``).
        df: DataFrame containing ``colname``.

    Returns:
        DataFrame with columns ``Element`` and ``Count`` (number of
        occurrences of that element over all rows), sorted by ``Count`` in
        descending order. The index reflects the order in which each element
        was first encountered.
    """
    # explode() emits a NaN placeholder for every empty list; drop those so
    # rows without any particle do not show up as a bogus NaN "Element".
    flattened_list = df[colname].explode().dropna().tolist()
    distribution = Counter(flattened_list)
    distribution_df = pd.DataFrame(list(distribution.items()), columns=['Element', 'Count'])
    return distribution_df.sort_values(by='Count', ascending=False)
# Frequency of each particle across the 'ground_truth' lists (expected outputs).
get_particle_counts('ground_truth', df)

Unnamed: 0,Element,Count
1,lah21,29
5,lah51,20
3,leh33,19
2,lor33,10
4,leh21,5
0,leh51,4
6,lah24,2


### Dataset Actual Output EDA

In [80]:
# How many rows received each distinct score value.
distribution_table = (
    df['scores']
    .value_counts()
    .reset_index()
)
distribution_table


Unnamed: 0,scores,count
0,1.0,31
1,0.0,16
2,0.5,10
3,0.75,1
4,0.667,1
5,0.333,1


In [81]:
# Header for the number displayed below; len(df) is the total number of rows.
print(f"TOTAL ACCURACY out of {len(df)}")
# Count-weighted sum of the distinct score values, i.e. the sum of all per-row
# scores. Relies on `distribution_table` still holding the df['scores'] value
# counts computed in the preceding cell.
sum(distribution_table['count']*distribution_table['scores'])

TOTAL ACCURACY out of 60


37.75

In [82]:
# Row counts for each (score, number of expected particles) combination.
distribution_table = (
    df[['scores', 'ground_truth_num_particles']]
    .value_counts()
    .reset_index()
)
distribution_table

Unnamed: 0,scores,ground_truth_num_particles,count
0,1.0,1,20
1,0.0,1,16
2,0.5,2,10
3,1.0,2,10
4,0.333,3,1
5,0.667,3,1
6,0.75,4,1
7,1.0,3,1


In [83]:
# Frequency of each particle across the 'actual_output_particles' lists.
get_particle_counts('actual_output_particles', df)

Unnamed: 0,Element,Count
1,lah21,51
0,leh33,25
2,lor33,11
3,leh21,3


In [84]:
# Keep every row whose score is anything other than a perfect 1.0.
df_incorrect = df.loc[df['scores'].ne(1.0)]
df_incorrect

Unnamed: 0,expected_output,actual_output,scores,ground_truth,ground_truth_num_particles,actual_output_particles
0,"\n\nA: I cut until this length ok?\nB: Yeah, o...","\n\n\nA: I cut until this length ok?\nB: Yeah,...",0.0,[leh51],1,[leh33]
1,\nA: Bank robbery leh51,"\n\nA: Bank robbery leh33, and you dare to tac...",0.0,[leh51],1,[leh33]
5,\n\nA: What are you shopping for?\nB: Watch lo...,\n\n\nA: What are you shopping for?\nB: Watch ...,0.5,"[lor33, leh21]",2,"[lor33, leh33]"
6,"\n\nA: Huh, are you still doing yesterday's tu...","\n\n\nA: Huh, are you still doing yesterday's ...",0.5,"[leh21, lah21]",2,"[leh33, lah21]"
7,"\n\nA: Hey, did you tell Shuhui about the surp...","\n\n\nA: Hey, did you tell Shuhui about the su...",0.5,"[leh21, leh33]",2,"[leh33, leh33]"
8,"\n\nA: Hey, have you been to Sarah's house? \n...","\n\n\nA: Hey, have you been to Sarah's house? ...",0.5,"[leh21, leh33]",2,"[leh33, leh21]"
9,"\n\nA: Hey, are you coming to the party tonigh...","\n\n\nA: Hey, are you coming to the party toni...",0.0,[leh51],1,[leh33]
10,\n\nA: Photographic society has a meeting on W...,\n\n\nA: Photographic society has a meeting on...,0.5,"[lah51, leh33]",2,"[lah21, leh33]"
15,"\n\nA: There's nothing to eat at home\nB: Oh, ...",\n\n\nA: There's nothing to eat at home\nB: Oh...,0.0,[lah51],1,[lah21]
17,"\n\nA: Hey, are you excited for university ori...","\n\n\nA: Hey, are you excited for university o...",0.0,[lah24],1,[lah21]


In [92]:
def compare_and_count(row, col):
    """Count how many times each element occurs in the list stored at ``row[col]``.

    Args:
        row: A mapping-like row (e.g. a DataFrame row) indexable by ``col``.
        col: Key of the cell holding the list of elements to count.

    Returns:
        A ``pd.Series`` indexed by element, whose values are occurrence counts.
    """
    return pd.Series(Counter(row[col]))

# One row per df row, one column per particle: occurrences of each particle in
# that row's 'ground_truth' list (0 where the particle does not occur).
distribution_df = df.apply(compare_and_count, axis=1, col='ground_truth').fillna(0)
distribution_df

Unnamed: 0,lah21,lah24,lah51,leh21,leh33,leh51,lor33
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,1.0
6,1.0,0.0,0.0,1.0,0.0,0.0,0.0
7,0.0,0.0,0.0,1.0,1.0,0.0,0.0
8,0.0,0.0,0.0,1.0,1.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# Per-row particle tally for the actual outputs (0 where a particle does not
# occur). The column created earlier is 'actual_output_particles' (plural); the
# singular spelling does not exist in df and raises KeyError.
distribution_df = df.apply(lambda row: compare_and_count(row, col='actual_output_particles'), axis=1).fillna(0)
distribution_df