# 1. import data and packages

In [None]:
# import neccessary packages
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import glob
import os
import re
import random
import itertools
import collections
from collections import Counter
import regex as re
import nltk
from nltk.corpus import stopwords
from scipy.ndimage.filters import gaussian_filter1d
import seaborn as sns

## 1.2 File import
### You should have run SILLA first, and have the cleaned file ready

In [None]:
# Load the cleaned transcript produced by the SILLA notebook.
df_ms = pd.read_pickle("your_processed_file_from_silla")
# only need these columns
# `.copy()` makes `ms_df` an independent frame, so the column assignments below
# modify `ms_df` itself rather than a slice of `df_ms` (avoids SettingWithCopyWarning).
ms_df = df_ms[['Speaker', 'Text', 'group','group_turn_id','group_speaker_turn_id']].copy()
# `w_target` starts as the raw text and is transformed step by step in section 2.
ms_df['w_target'] = ms_df['Text']
ms_df

# 2. Text preprocessing, get token list of words for each utterance 
Steps: convert to lowercase, tokenize, remove punctuation, and remove stop words.
## 2.1 Convert text to lowercase
Will convert text to lowercase so there is no issue with word comparisons.

In [None]:
# Lowercase every utterance and assign the whole column at once.
# Writing element by element through `ms_df['w_target'][i]` is chained assignment:
# it treats the loop position as an index label and does not update the frame
# when pandas Copy-on-Write is active.
ms_df['w_target'] = [str(text).lower() for text in ms_df['w_target']]

print(ms_df['w_target'][-3:])

Remove text enclosed in square brackets `[]` or parentheses `()`, which includes annotations such as "[Student calls teacher to look at their assignment.]"

In [None]:
# Strip bracketed / parenthesised annotations from the working text.
# The result is stored in `w_target` (the column that is tokenized in the next step),
# so the cleaning actually reaches the tokenizer and the raw `Text` column is preserved.
# A raw string is used so the backslashes in the pattern are not treated as string escapes.
ms_df['w_target'] = [re.sub(r"[\(\[].*?[\)\]]", '', str(text)) for text in ms_df['w_target']]

# print(ms_df['w_target'][-10:])

## 2.2 Tokenize and Remove punctuation

In [None]:
# Tokenize each utterance and keep only alphanumeric tokens (drops punctuation tokens).
# The column is replaced in one assignment instead of element-wise chained assignment,
# which depends on the index being 0..N-1 and is ignored under pandas Copy-on-Write.
ms_df['w_target'] = ms_df['w_target'].map(
    lambda text: [word for word in nltk.word_tokenize(str(text)) if word.isalnum()]
)

In [None]:
print(ms_df['w_target'][-10:])

In [None]:
# ms_df.to_pickle('Middle_School/before_stop_remove')

## 2.3 Remove Stop Words
Be careful when removing stop words: this step also removes some programming-related words such as `if`, `for`, and `and`. If these words are important for your analysis, skip this step.

In [None]:
stopword = stopwords.words('english')
# A set gives constant-time membership tests while filtering.
stopword_set = set(stopword)
# Filter the token lists held in `w_target`. The `Text` column holds whole strings,
# and iterating over a string yields single characters rather than words.
ms_df['w_target'] = ms_df['w_target'].map(
    lambda tokens: [word for word in tokens if word not in stopword_set]
)

In [None]:
print(ms_df['w_target'][-10:])

# 3. Set up target and prime

## 3.1 target = word list of current utterance
This was set up previously as the `w_target` column.

## 3.2 Get the prime: the previous n utterances from the other speaker (window size customizable)

In [None]:
# set get_prime_w_list to be n turns prior to the target turn 
def get_prime_w_list(df_ms, n): # specify the number of turns as n, return a dict
    """Build the prime word list for every utterance.

    For each row, the prime is made of at most `n` utterances that
      * belong to the same `group`,
      * come earlier in the conversation (ordered by `group_turn_id`), and
      * were produced by a different `Speaker`.
    The `w_target` token lists of those utterances are concatenated in
    conversation order.

    Only utterances that precede the target are used, so the result does not
    depend on speakers alternating strictly or on turn ids starting at 1; for
    strictly alternating two-speaker data numbered from 1 this equals the
    `turn_id // 2` windowing.

    Parameters
    ----------
    df_ms : pandas.DataFrame
        Must contain the columns `group`, `group_turn_id`, `Speaker` and
        `w_target` (one list of tokens per row).
    n : int
        Maximum number of other-speaker utterances in the prime window.

    Returns
    -------
    dict
        Maps each row's index label to its flat list of prime tokens (an empty
        list when no earlier other-speaker utterance exists).
    """
    prime_dict = {}
    # One pass per group instead of filtering the whole frame for every row.
    for _, group_rows in df_ms.groupby('group', sort=False):
        # Stable sort keeps the frame order for rows sharing a turn id.
        group_rows = group_rows.sort_values('group_turn_id', kind='mergesort')
        history = []  # (speaker, tokens) of the turns already seen in this group
        for row_label, speaker, tokens in zip(group_rows.index,
                                              group_rows['Speaker'],
                                              group_rows['w_target']):
            # Walk backwards through earlier turns until n other-speaker turns are found.
            window = []
            for past_speaker, past_tokens in reversed(history):
                if len(window) >= n:
                    break
                if past_speaker != speaker:
                    window.append(past_tokens)

            # Restore conversation order and flatten into a single word list.
            prime_words = []
            for past_tokens in reversed(window):
                prime_words.extend(past_tokens)

            prime_dict[row_label] = prime_words
            history.append((speaker, tokens))
    return prime_dict

In [None]:
# add the prime word-list column (10-turn window)
prime_10 = get_prime_w_list(ms_df, 10)
# Rows without an entry in the dict get an empty list so every row can be treated alike.
# A dict lookup with a default avoids testing for NaN (`np.NaN` no longer exists in
# NumPy 2.0, and `is` comparisons against NaN depend on object identity).
ms_df['w_prime_10'] = [prime_10.get(row_label, []) for row_label in ms_df.index]

In [None]:
# add the prime word-list column (5-turn window)
prime_5 = get_prime_w_list(ms_df, 5)
# Rows without an entry in the dict get an empty list so every row can be treated alike.
# A dict lookup with a default avoids testing for NaN (`np.NaN` no longer exists in
# NumPy 2.0, and `is` comparisons against NaN depend on object identity).
ms_df['w_prime_5'] = [prime_5.get(row_label, []) for row_label in ms_df.index]
# df_ms.head()

## 3.3 Calculate the LILLA

In [None]:
# find the overlap items in two lists (target and prime)
# calculate the length of utterances
def find_overlap(df_ms, prime_window):
    """Count prime/target word overlap and utterance lengths for every row.

    Adds (or overwrites) these integer columns on `df_ms` in place:
      * `overlap_count` - number of prime tokens that also occur in the target
        (each prime token occurrence is counted once),
      * `len_target`    - number of tokens in `w_target`,
      * `len_prime`     - number of tokens in the prime column,
      * `len_prime_<prime_window>` - same values as `len_prime`, stored under the
        window-specific name that this function returns.

    Parameters
    ----------
    df_ms : pandas.DataFrame
        Must contain `w_target` and `w_prime_<prime_window>`, each holding one
        list of tokens per row.
    prime_window : int
        Window size used to pick the prime column, e.g. 10 -> `w_prime_10`.

    Returns
    -------
    str
        The name of the window-specific prime-length column.
    """
    prime_column = 'w_prime_'+ str(prime_window) # get the column name based on the chosen prime_window
    len_prime_column = 'len_prime_' + str(prime_window)

    print(prime_column)

    overlap_counts = []
    target_lengths = []
    prime_lengths = []
    for prime_tokens, target_tokens in zip(df_ms[prime_column], df_ms['w_target']):
        # Set membership keeps the count identical while avoiding a list scan per token.
        target_vocab = set(target_tokens)
        overlap_counts.append(sum(1 for token in prime_tokens if token in target_vocab))
        target_lengths.append(len(target_tokens))
        prime_lengths.append(len(prime_tokens))

    # Whole-column assignment is positional, so it also works with duplicate index labels.
    df_ms['overlap_count'] = np.asarray(overlap_counts, dtype='int64')
    df_ms['len_target'] = np.asarray(target_lengths, dtype='int64')
    df_ms['len_prime'] = np.asarray(prime_lengths, dtype='int64')
    # Make the returned column name refer to a column that actually exists.
    df_ms[len_prime_column] = df_ms['len_prime']

    return len_prime_column # keep the len_prime column name for future use

In [None]:
# specify your prime window, currently is 10 
find_overlap(ms_df, 10)
ms_df[18:25]

In [None]:
# LILLA = number of overlapping elements / (len_target * len_prime)
# The length product is stored as well because it is the key used for normalization later.
ms_df['len_prime_target'] = ms_df['len_target'].mul(ms_df['len_prime'])
ms_df['lilla'] = ms_df['overlap_count'].div(ms_df['len_prime_target'])

In [None]:
# df_ms.to_csv(r'middle_school_SILLA_uncleaned.csv')

In [None]:
# df_ms.to_csv(r'test_syntax.csv')

## 3.4 Calculate Normalized LLA (nLLA)

### 3.4.1 compute $\bar {LLA}$
i.e., the average LLA over all prime–target pairs that share the same product of lengths, computed for every product value n that occurs.
The outcome of D1 should be a dictionary whose keys are the product values (n) and whose values are the average LLA of all pairs with that product value.

In [None]:
# Reitter paper idea: normalize by the average LLA for the same product of length, which is len(Prime)*len(target)
# "product of length" column: len_prime_target
grouped_element_length = ms_df.groupby('len_prime_target') # group the df by utternace length 
print(len(grouped_element_length)) # total of 1657 unique len(Prime)*len(target) for ms, 946 for ug
# 1539 unique len(Prime)*len(target) on lexicon level for ms

avg_lla_list = {} # key = length, value = average_lla, use this to add the column to df later
for l in grouped_element_length: # l is a tuple object, l[0] is the element length, l[1] is the subset dataframe
    avg_lla = l[1]["lilla"].mean()
    avg_lla_list[l[0]] = avg_lla
print(list(avg_lla_list.items())[:5])   

In [None]:
# add avg_lla dictionary to the df as a column, mapping by len(Prime)*len(target)
ms_df['avg_lilla'] = ms_df.len_prime_target.map(avg_lla_list)
ms_df.head()

### 3.4.2 compute nLLA

In [None]:
# get the nLLA = LLA / avg_lla
ms_df['nlilla'] = ms_df['lilla']/ms_df['avg_lilla'] 
ms_df

In [None]:
# Persist the frame, including the LILLA / nLILLA columns, as a pickle in the working directory.
ms_df.to_pickle("./LILLA_scores.pkl") # change the file name if you want

# 4. Visual inspection 

In [None]:
# load the data
# NOTE(review): `df_silla` and `df_lilla` are not referenced again before they are
# reloaded in the "Merge SILLA and LILLA" section, and these paths are specific to
# one project layout - confirm this cell is still needed.
df_silla = pd.read_pickle("../data/2_middle school silla/ms_7812_nsilla_0414_dist10.pkl")
df_lilla = pd.read_pickle("../data/6_lilla/ms_7812_lilla_dist10.pkl")

In [None]:
np.count_nonzero(ms_df['overlap_count'], axis=0)

## 4.1 Distribution of LILLA

In [None]:
# Replace +/- infinity anywhere in the frame with 0 (NaN values are not touched by this call).
ms_df = ms_df.replace([np.inf, -np.inf], 0)

In [None]:
# plot the distribution of SILLA scores
plt.hist(ms_df['lilla'], bins = 400)
plt.xlim(-0.01, 0.1)
plt.title('LILLA distribution - ms')
plt.show()

In [None]:
# remove zero
plt.hist(ms_df[ms_df['lilla'] != 0]['lilla'], bins = 400)
plt.xlim(-0.01, 0.15)
plt.title('LILLA distribution - ms - nonzero only')
plt.show()

In [None]:
# count percentage of zero values of silla
ms_df['lilla'].value_counts(normalize=True)

## 4.2 Distribution of Normalized LLA (nLLA)
The distribution shape changed a bit, the scale changed, general trends look similar.

In [None]:
# plot the distribution of normalized SILLA scores
plt.hist(ms_df['nlilla'], bins = 120)
plt.xlim(-0.2, 8)
plt.title('nLILLA distribution - ms')
plt.show()

In [None]:
# plot the distribution of normalized SILLA scores
plt.hist(ms_df[ms_df['nlilla'] != 0]['nlilla'], bins = 120)
plt.xlim(-0.2, 8)
plt.title('nLILLA distribution - ms - nonzero only')
plt.show()

## 4.3 Distribution of LLA by groups

### 4.3.1  Distribution of LILLA by groups

In [None]:
# One small LILLA histogram per group, preceded by the group's name.
grouped = ms_df.groupby('group')

for group_name, group_df in grouped:
    print(group_name)
    group_df['lilla'].plot.hist(bins=200, xlim=(-0.01, 0.1), figsize=(3, 3))
    plt.show()

### 4.3.2  Distribution of nLLA by groups

In [None]:
# One small nLILLA histogram per group, preceded by the group's name.
# The normalized score lives in the `nlilla` column; the frame has no `nlla`
# attribute, so `group[1].nlla` raised AttributeError.
for group_name, group_df in grouped:
    print(group_name)
    group_df['nlilla'].plot.hist(bins = 80, xlim = (-0.5, 6), figsize=(3,3))
    plt.show()

## 4.4 distribution of sentence length, relationship between sentence length and LLA 

In [None]:
# plot the distribution of sentence length
plt.hist(ms_df['len_target'], bins = 200)
plt.xlim(-1, 80)
plt.show()

In [None]:
# any patterns between sentence length and SILLA score
x = ms_df['len_target']
y = ms_df['lilla']
plt.scatter(x,y)

In [None]:
# any patterns between sentence length and overlap count
plt.scatter(ms_df['len_target'],ms_df['overlap_count'])
m, b = np.polyfit(ms_df['len_target'], ms_df['overlap_count'], 1)
plt.plot(ms_df['len_target'], m*ms_df['len_target'] + b,color = 'green')
print('slop: ', m, 'intercept: ', b)

In [None]:
# take a granular look 
plt.scatter(ms_df['len_target'],ms_df['overlap_count'])
plt.xlim(0, 80)
plt.ylim(-1, 120)

## 4.5 Distribution of words

In [None]:
# plot the distribution of syntax subtrees

# Create dictionary
# Word -> number of occurrences over all `w_target` token lists,
# in order of first appearance.
dict_freq = dict(Counter(itertools.chain.from_iterable(ms_df['w_target'])))
# (count, word) pairs so that sorting orders by count first, highest first.
word_freq_list = [(count, word) for word, count in dict_freq.items()]
freq_list_sorted = sorted(word_freq_list, reverse=True)
print(freq_list_sorted)

In [None]:
from nltk.book import *

In [None]:
# take a look at the syntax subtree frequency dictionary 
print('unique words: ',len(dict_freq))
print('total word freq (for all utterances):', sum(dict_freq.values()))

In [None]:
# Build the frequency distribution from the word counts.
# `FreqDist` is referenced through the `nltk` package imported at the top of the
# notebook, so this cell does not depend on names pulled in by `from nltk.book import *`.
fdist = nltk.FreqDist(dict_freq)

# the 50 most common words
fdist.most_common(50)

In [None]:
fdist.plot(40)

# 5. Merge SILLA and LILLA

In [None]:
# load the data
# Placeholder paths: point these at the pickles written by the SILLA notebook and by this notebook.
df_silla = pd.read_pickle("your silla file")
df_lilla = pd.read_pickle("your lilla file")

In [None]:
# Keep only the word-level columns, with the target length renamed to a word-specific
# name so it does not clash with the syntax-level length after merging.
df_lilla = df_lilla.assign(w_len_target=df_lilla['len_target'])[
    ['w_target', 'lilla', 'nlilla', 'w_len_target']
]
df_lilla[:2]

In [None]:
# Copy the syntax-level scores under SILLA-specific names, add a combined
# group + speaker key, and keep only the columns needed for the merge.
df_silla = df_silla.assign(
    syn_len_target=df_silla['len_target'],
    silla=df_silla['lla'],
    nsilla=df_silla['nlla'],
    group_speaker=df_silla['group'] + df_silla['Speaker'],
)[['Text', 'group', 'group_speaker', 'group_turn_id', 'syntax_tree_current', 'silla', 'nsilla', 'syn_len_target']]
df_silla[:2]

In [None]:
# Join the two frames row by row on their index labels.
df_merge = df_silla.merge(df_lilla, left_index=True, right_index=True)
df_merge

In [None]:
# add the ratio 
df_merge['silla_lilla'] = df_merge['silla']/df_merge['lilla']
df_merge['nsilla_nlilla'] = df_merge['nsilla']/df_merge['nlilla']

In [None]:
# output for other analysis
# Writes the merged frame twice into the working directory: as a pickle and as CSV.
df_merge.to_pickle("./ms_merged_silla_lilla.pkl")
df_merge.to_csv(r'ms_merged_silla_lilla.csv')

## plot silla and lilla

In [None]:
df_ms = pd.read_pickle("ms_merged_silla_lilla.pkl")

In [None]:
df_ms[['silla','lilla','nlilla','w_len_target']].describe()

In [None]:
# SILLA against LILLA for every merged row, both axes limited to the same range.
fig, ax = plt.subplots()
ax.scatter(df_merge['silla'], df_merge['lilla'])
ax.set_xlim(-0.01, 0.2)
ax.set_ylim(-0.01, 0.2)

In [None]:
# deal with NaN values
# `grp` is another name for `df_ms` (no copy is made), so the replacement below also changes `df_ms`.
grp = df_ms
# Counts the missing LILLA values; the result is not displayed because this is not the cell's last line.
grp['lilla'].isna().sum() 
# replace NaN values to the median (1)
# NOTE(review): 0.01046 is hard-coded; presumably the median LILLA of the ms data - confirm before reusing on other data.
grp['lilla'] = grp['lilla'].replace(np.nan, 0.01046)#  0.01046 for ms lilla
# df_ms['nlla'].isna().sum()

In [None]:
# Smooth the LILLA column with a Gaussian filter and plot it against the turn id.
# NOTE(review): the filter runs over the whole column in row order, so the smoothing
# spans group boundaries - confirm this is intended.
y = grp['lilla'].to_numpy(copy=True).ravel()
ysmoothed = gaussian_filter1d(y, sigma=10)
sns.lineplot(data=grp, x='group_turn_id', y=ysmoothed)
plt.xlim(-10, 250)
# plt.ylim(0.018, 0.033)
plt.title('MS lilla over time for all groups')
plt.show()

In [None]:
# Distribution of the silla/lilla ratio.
# Infinite ratios (lilla == 0) are replaced by -1 so they show up as their own bar.
ms_df = df_merge.replace([np.inf, -np.inf], -1)
fig, ax = plt.subplots()
ax.hist(ms_df['silla_lilla'], bins=200)
ax.set_xlim(-2, 15)
plt.show()

In [None]:
# after replacing the inf values to -1, here is the descr stats
ms_df['silla_lilla'].describe()