<a href="https://colab.research.google.com/github/tduong191/DataStructure_Python/blob/main/classify_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import string
import re
import numpy as np

In [None]:
# Authenticate the current Colab user, then build a gspread client from the
# application-default credentials obtained through that login.
from google.colab import auth
auth.authenticate_user()

import gspread
# NOTE(review): oauth2client is deprecated upstream; recent gspread releases
# expect google-auth credentials instead - confirm against the installed version.
from oauth2client.client import GoogleCredentials

# `gc` is the authorised gspread client used by the following cell to open the sheet.
gc = gspread.authorize(GoogleCredentials.get_application_default())

In [None]:
# Open the first worksheet of the 'classify-error' spreadsheet.
sheet = gc.open('classify-error').sheet1

# get_all_values gives a list of rows (the first row is used as the header below).
rows = sheet.get_all_values()
df = pd.DataFrame.from_records(rows)
# Promote the first record to column names, then drop it from the data.
# The remaining rows keep their original positional labels, so the index starts at 1.
df.columns = df.iloc[0]
df = df.drop(0)

In [None]:
# Drop rows that haven't been classified: empty strings are first turned into
# NaN so that dropna() can detect a missing 'error_label_1'.
# Both steps mutate `df` in place.
from numpy import nan  # NOTE(review): `nan` is not used in this cell; `nan_value` below is used instead
nan_value = float("NaN")
df.replace("", nan_value, inplace=True)
df.dropna(subset = ['error_label_1'], inplace=True)


In [None]:
def damerau_levenshtein_distance(s1, s2):
    """Return the length-normalised Damerau-Levenshtein distance between two strings.

    The raw distance counts deletions, insertions, substitutions and
    transpositions of two adjacent characters (restricted variant: a
    transposed pair is not edited again). It is then divided by the mean
    length of the two inputs, so 0.0 means the strings are identical and
    larger values mean they differ more. The value can exceed 1.0, e.g. when
    one string is empty.

    Args:
        s1: first string (any indexable sequence of comparable items works).
        s2: second string.

    Returns:
        float: raw edit distance divided by ``(len(s1) + len(s2)) / 2``.
        Two empty inputs yield 0.0 instead of dividing by zero.
    """
    len1, len2 = len(s1), len(s2)
    if len1 == 0 and len2 == 0:
        # Both inputs empty: they are identical, and the mean length is 0,
        # so the normalisation below would raise ZeroDivisionError.
        return 0.0

    # dist[i][j] is the edit distance between the prefixes s1[:i] and s2[:j].
    dist = [[0] * (len2 + 1) for _ in range(len1 + 1)]
    for i in range(len1 + 1):
        dist[i][0] = i  # delete all i characters
    for j in range(len2 + 1):
        dist[0][j] = j  # insert all j characters

    for i in range(1, len1 + 1):
        for j in range(1, len2 + 1):
            cost = 0 if s1[i - 1] == s2[j - 1] else 1
            best = min(
                dist[i - 1][j] + 1,         # deletion
                dist[i][j - 1] + 1,         # insertion
                dist[i - 1][j - 1] + cost,  # substitution
            )
            # Two adjacent characters swapped between the strings.
            if i > 1 and j > 1 and s1[i - 1] == s2[j - 2] and s1[i - 2] == s2[j - 1]:
                best = min(best, dist[i - 2][j - 2] + cost)  # transposition
            dist[i][j] = best

    return dist[len1][len2] / ((len1 + len2) / 2)

In [None]:
def is_similar(s1, s2, threshold=0.4):
  """Tell whether two words are similar according to their edit distance.

  damerau_levenshtein_distance() returns a normalised *distance*: 0 for
  identical words, growing as the words diverge. Two words are therefore
  similar when the distance is at or below the threshold. The previous
  comparison (`> 0.4`) was inverted and reported identical words as
  dissimilar.

  Args:
      s1: first word.
      s2: second word.
      threshold: largest normalised distance still counted as similar
          (defaults to 0.4).

  Returns:
      bool: True when the normalised distance is <= ``threshold``.
  """
  return damerau_levenshtein_distance(s1, s2) <= threshold

In [None]:
import re


def remove_punc(strlist):
  """Strip every non-alphanumeric character from each string in ``strlist``.

  Besides punctuation this also removes whitespace and any other character
  for which ``str.isalnum`` is False. A string with no alphanumeric
  characters becomes ``''``; the output always has one entry per input entry.

  Args:
      strlist: iterable of strings.

  Returns:
      list[str]: the cleaned strings, in the original order.
  """
  return [''.join(ch for ch in token if ch.isalnum()) for token in strlist]

Data Processing

In [None]:
# transform string data type to list: every cell of the columns below is split
# on "," into a list of its comma-delimited pieces
for list_column in ('story_text', 'transcript', 'aligned_text', 'aligned_transcript', 'error_index'):
  df[list_column] = df.apply(lambda row: list(row[list_column].split(",")), axis = 1)


In [None]:
# remove punctuations in list: run remove_punc over each list-valued column so
# that only the alphanumeric characters of every entry remain
for token_column in ('story_text', 'transcript', 'aligned_text', 'aligned_transcript', 'error_index'):
  df[token_column] = df.apply(lambda row: remove_punc(row[token_column]), axis = 1)

In [None]:
def get_error_word(aligned_text, aligned_transcript, error_index):
  """Collect the expected and the spoken word at every error position.

  Args:
      aligned_text: sequence of words from the text, indexable by position.
      aligned_transcript: sequence of words from the transcript, indexable by
          the same positions.
      error_index: iterable of positions at which the two sequences disagree.

  Returns:
      list[list]: one ``[index, aligned_text[index], aligned_transcript[index]]``
      entry per position, in the order given by ``error_index``.
  """
  return [
      [position, aligned_text[position], aligned_transcript[position]]
      for position in error_index
  ]

In [None]:
def to_int(index):
  """Convert every element of ``index`` to ``int``.

  Args:
      index: iterable of values accepted by ``int()`` (e.g. digit strings).

  Returns:
      list[int]: the converted values, in the original order.

  Raises:
      ValueError: if an element cannot be parsed as an integer.
  """
  return list(map(int, index))


In [None]:
# Turn the error positions into integers so they can be used as list indices.
df['error_index'] = df.apply(lambda row: to_int(row['error_index']), axis = 1)
# For every error position, record [index, expected word, spoken word].
df['error_words'] = df.apply(
    lambda row: get_error_word(row['aligned_text'], row['aligned_transcript'], row['error_index']),
    axis = 1,
)

Classification Algorithm


In [None]:
def classify(error_words, story_text, transcript):
  """Label every misaligned word pair as a correct reading or a miscue.

  Args:
      error_words: iterable of ``[index, text_word, transcript_word]`` entries
          (the shape produced by get_error_word()). An empty string in either
          word slot means there is no word on that side of the alignment.
      story_text: list of the words of the story.
      transcript: list of the words that were spoken.

  Returns:
      list[list]: one ``[position, label]`` pair per entry. ``label`` is one of
      'correct - repetition', 'correct - self-correction', 'miscue - skip',
      'miscue - substitution' or 'miscue - unrelated chatter'. ``position`` is
      either the first occurrence of the relevant word in ``story_text`` or
      the entry's own index, depending on the label.

  Raises:
      ValueError: if a word that has to be located is not in ``story_text``.
  """
  labelled = []
  for entry in error_words:
    expected = entry[1]
    spoken = entry[2]

    if expected == '':
      # Nothing in the text is aligned with the spoken word.
      if spoken in story_text:
        if spoken in transcript:
          # student already said the correct word in transcript
          verdict = 'correct - repetition'
        else:
          # student made a close attempt before this word and got it right
          verdict = 'correct - self-correction'
        position = story_text.index(spoken)
      else:
        # the spoken word cannot be found anywhere in the text
        verdict = 'miscue - unrelated chatter'
        position = entry[0]
    elif spoken == '':
      # No attempt was made at the expected word.
      verdict = 'miscue - skip'
      position = story_text.index(expected)
    elif spoken in story_text:
      # The spoken word occurs in the story: treated as a repetition.
      verdict = 'correct - repetition'
      position = story_text.index(spoken)
    elif not is_similar(expected, spoken):
      # The two aligned words are not judged similar: no real attempt.
      verdict = 'miscue - unrelated chatter'
      position = entry[0]
    elif expected in transcript:
      # Similar attempt, and the expected word was also said.
      verdict = 'correct - self-correction'
      position = story_text.index(expected)
    else:
      # Similar attempt, but the expected word was never said.
      verdict = 'miscue - substitution'
      position = entry[0]

    labelled.append([position, verdict])

  return labelled




In [None]:
# Classify the errors of every row; each cell becomes a list of [position, label] pairs.
df['classify'] = df.apply(
    lambda row: classify(row['error_words'], row['story_text'], row['transcript']),
    axis = 1,
)

In [None]:
# Show the story, the transcript and the resulting classification side by side.
df[['story_text', 'transcript', 'classify']]

Unnamed: 0,story_text,transcript,classify
1,"[no, it, wasnt, there]","[no, it, was, there]","[[2, miscue - substitution]]"
2,"[baby, bunny, is, a, boy]","[baby, bunny, a, boy]","[[2, miscue - skip]]"
3,"[i, love, bugs]","[i, i]","[[1, miscue - skip], [0, correct - repetition]]"
4,"[the, mouse, ran, down]","[the, mouse, down]","[[2, miscue - skip]]"
5,"[i, need, dad]","[dad, dad]","[[0, miscue - skip], [2, correct - repetition]]"
...,...,...,...
563,"[have, this, wish, i, wish, tonight]","[have, that, have, this, wish, i, wish, tonight]","[[0, correct - repetition], [1, miscue - unrel..."
564,[boot],"[boot, boot]","[[0, correct - repetition]]"
565,"[do, you, know, the, muffin, man]","[do, you, know, the, muffin, the, muffin]","[[3, correct - repetition], [4, correct - repe..."
566,[bird],"[bird, bird]","[[0, correct - repetition]]"


Test Model Accuracy


In [None]:
# Names of the label columns, i.e. every column whose name starts with 'error_label_'.
label = df.columns[df.columns.str.startswith('error_label_')]
# NOTE(review): dead code below - `unwanted` is not defined in any visible cell.
#df.drop(unwanted, axis=1, inplace=True)

In [None]:
# Spread each row's classification results over columns e1, e2, ... (one per error).
# The column names are derived from the widest row instead of being fixed at
# seven: pandas raises a ValueError when the number of names passed to the
# constructor differs from the number of data columns, so the hard-coded list
# only worked when some row had exactly seven errors.
split_df = pd.DataFrame(df['classify'].tolist())
split_df.columns = [f'e{n}' for n in range(1, split_df.shape[1] + 1)]
# NOTE(review): each cell holds the [position, label] pair produced by classify(),
# not the bare label, and the index restarts at 0 rather than following df.index -
# confirm this is what the comparison against the error_label_* columns expects.

In [None]:
# Display the per-error classification columns.
split_df

Unnamed: 0,e1,e2,e3,e4,e5,e6,e7
0,miscue - substitution,,,,,,
1,miscue - skip,,,,,,
2,miscue - skip,miscue - substitution,,,,,
3,miscue - skip,,,,,,
4,miscue - skip,miscue - substitution,,,,,
...,...,...,...,...,...,...,...
472,miscue - unrelated chatter,miscue - unrelated chatter,,,,,
473,miscue - unrelated chatter,,,,,,
474,miscue - unrelated chatter,miscue - substitution,,,,,
475,miscue - unrelated chatter,,,,,,


In [None]:
# Keep only the 'error_label_*' columns selected in `label`.
df_label = df[label]

In [None]:
# Display the label columns for comparison with the classifier output.
df_label

Unnamed: 0,error_label_1,error_label_2,error_label_3,error_label_4,error_label_5,error_label_6,error_label_7
1,miscue - substitution,,,,,,
2,correct - repetition,,,,,,
3,miscue - substitution,,,,,,
4,miscue - skip,,,,,,
5,miscue - skip,miscue - substitution,,,,,
...,...,...,...,...,...,...,...
563,miscue - substitution,,,,,,
564,correct - repetition,,,,,,
565,miscue - substitution,correct - repetition,,,,,
566,correct - repetition,,,,,,
