Summary:

1. Create role ids for teachers and parents
2. Create text chain ids (i.e., an ID for teacher-parent pair)
3. Fix date and time formatting (swap in the corrected dates and rebuild the combined date-time column)
4. Manual language correction for "Grasias"
5. Create lagged (replied-to) message columns to see the replies

In [None]:
## data visualization
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np

from datetime import datetime, date
import dateutil.parser

In [None]:
# Read the pickled message-level data; the path is relative to the working directory
msg_data = pd.read_pickle(
    '../data/analysis_data/msgs_wdem_wtrans_1203.pkl'
)
# Display (n_rows, n_cols) as a quick sanity check
msg_data.shape

# 1. Create role ids for teachers and parents

## 1.1. For teachers and administrators

In [None]:
## first, role ids for teachers and admin (from old script 8)
# Unique (sender name, role) pairs among the messages sent by the school side
role_id_schoolsent = msg_data.loc[msg_data.broad_type == "school_sent",
                                 ['sender_full_name', 'role']].drop_duplicates()

# Number teachers and administrators separately, each sequence starting at 1
teacher_id = role_id_schoolsent[role_id_schoolsent.role == 'Teacher'].copy()
teacher_id['role_id'] = ['teacher_id_' + str(i) for i in range(1, len(teacher_id) + 1)]
admin_id = role_id_schoolsent[role_id_schoolsent.role == 'Administrator'].copy()
admin_id['role_id'] = ['admin_id_' + str(i) for i in range(1, len(admin_id) + 1)]

# rowbind the teacher and admin ids together
# (pd.concat instead of DataFrame.append, which was removed in pandas 2.0)
role_id_combined = pd.concat([teacher_id, admin_id], ignore_index = True)

# check duplicates: a sender name that appears twice here is listed as both
# Teacher and Administrator (the original run found two such senders)
school_sent_multiple_roles = role_id_combined[role_id_combined.sender_full_name\
                                             .duplicated(keep=False)].copy()
#school_sent_multiple_roles

# Remove duplicates from our dataset: for multi-role senders drop the Teacher
# row so that only their Administrator id remains
multiple_role_list = school_sent_multiple_roles['sender_full_name'].drop_duplicates().tolist()
role_id_combined_tomerge = role_id_combined[~((role_id_combined.sender_full_name.isin(multiple_role_list)) &
                                             (role_id_combined.role == 'Teacher'))].copy()

# Display the sizes before and after removing the duplicate Teacher rows
role_id_combined.shape
role_id_combined_tomerge.shape


# in messaging data, reassign the role to be "Administrator"
# if the sender name is in the multiple role list, so that a later merge on
# (sender_full_name, role) still finds the single remaining id
msg_data['role'] = np.where(msg_data.sender_full_name.isin(multiple_role_list),
                           'Administrator',
                            msg_data.role)

## 1.2. For parents

In [None]:
## might want to add school as well?
# Parents who sent at least one message, keyed by (name, student, school)
is_parent_sent = msg_data.broad_type == "parent_sent"
all_sending_parents = (
    msg_data.loc[is_parent_sent, ['sender_full_name', 'StudentID', 'school_merge']]
    .rename(columns={'sender_full_name': 'parent_full_name'})
    .drop_duplicates()
)

# Parents who received at least one message from the school side
is_school_sent = msg_data.broad_type == "school_sent"
all_receiving_parents = (
    msg_data.loc[is_school_sent, ['receiver_full_name', 'StudentID', 'school_merge']]
    .rename(columns={'receiver_full_name': 'parent_full_name'})
    .drop_duplicates()
)

# Stack senders and receivers, keeping each (name, student, school) row once
all_parents_togiveids = pd.concat(
    [all_sending_parents, all_receiving_parents]
).drop_duplicates()

n_students = len(all_parents_togiveids.StudentID.unique())
n_parent_rows = all_parents_togiveids.shape[0]
print("There are {} students with {} parent names".format(n_students, n_parent_rows))

# One sequential id per remaining row, starting at parent_id_1
all_parents_togiveids['role_id'] = [
    'parent_id_' + str(i) for i in range(1, len(all_parents_togiveids) + 1)
]

#all_parents_togiveids.head()

In [None]:
#ids_to_rm = ['teacher_id_100', 'teacher_id_113', 'teacher_id_127', 'teacher_id_142']

#role_ids_rm_dupes = role_id_combined_tomerge[~role_id_combined_tomerge.role_id.isin(ids_to_rm)].copy()

#Should rm 4
#role_id_combined_tomerge.shape[0] - role_ids_rm_dupes.shape[0]==4

## 1.3. Stitch ids back to original dataset

### 1.3.1. Start with the school dataset first

In [None]:
# Subset to messages sent by the school side; copy so edits do not touch msg_data
sent_by_school = msg_data.broad_type == 'school_sent'
msgs_school_sent = msg_data.loc[sent_by_school].copy()
msgs_school_sent.shape

In [None]:
# Attach the school-side sender's role id, matching on name and role
msgs_school_sent_wrole_id = (
    msgs_school_sent
    .merge(role_id_combined_tomerge,
           how='left',
           on=['sender_full_name', 'role'])
    .rename(columns={'role_id': 'sender_role_id'})
)

# Attach the receiving parent's role id, matching on name, school and student
msgs_school_sent_wrole_wparent = (
    msgs_school_sent_wrole_id
    .merge(all_parents_togiveids,
           how='left',
           left_on=['receiver_full_name', 'school_merge', 'StudentID'],
           right_on=['parent_full_name', 'school_merge', 'StudentID'])
    .rename(columns={'role_id': 'receiver_role_id'})
)

In [None]:
# Check that the merged-in parent name agrees with the receiver name before
# dropping the helper column. Every row that received a parent id is checked,
# instead of spot-checking one hard-coded id with a label-based `[0]` lookup,
# which raises KeyError unless the matching row happens to carry index label 0.
matched_rows = msgs_school_sent_wrole_wparent[
    msgs_school_sent_wrole_wparent.receiver_role_id.notna()]

# Two missing names count as agreeing (NaN == NaN is False on its own)
check_names = (matched_rows.receiver_full_name == matched_rows.parent_full_name) | \
              (matched_rows.receiver_full_name.isna() & matched_rows.parent_full_name.isna())

if check_names.all():
    msgs_school_sent_wrole_wparent.drop(columns='parent_full_name', inplace = True)
else:
    # Leave the column in place for inspection, but say so instead of failing silently
    print('Stop! Receiver names and merged parent names disagree; parent_full_name was kept')

In [None]:
# The left merges should not have changed the number of school-sent rows
school_rows_unchanged = len(msgs_school_sent_wrole_wparent) == len(msgs_school_sent)
print('Proceed with parent-sent messages' if school_rows_unchanged
      else 'Stop! Something added extra rows')

### 1.3.2. Then the parents

In [None]:
# Subset to messages sent by parents, dropping fully duplicated rows.
# NOTE(review): duplicates are dropped here but not in the school-sent subset,
# while the later row-count check compares against msg_data - confirm intended.
sent_by_parent = msg_data.broad_type == 'parent_sent'
msgs_parent_sent = msg_data.loc[sent_by_parent].drop_duplicates().copy()
msgs_parent_sent.shape

In [None]:
# Attach the sending parent's role id, matching on name, school and student;
# the duplicated name column from the look-up table is dropped straight away
msgs_parent_sent_wsender_id = (
    msgs_parent_sent
    .merge(all_parents_togiveids,
           how='left',
           left_on=['sender_full_name', 'school_merge', 'StudentID'],
           right_on=['parent_full_name', 'school_merge', 'StudentID'])
    .drop(columns='parent_full_name')
    .rename(columns={'role_id': 'sender_role_id'})
)

# Displays True when the left merge kept the row count unchanged
len(msgs_parent_sent_wsender_id) == len(msgs_parent_sent)


# Look-up of school-side role ids keyed by receiver name (role column removed)
receiver_id_lookup = (
    role_id_combined_tomerge
    .rename(columns={'sender_full_name': 'receiver_full_name'})
    .drop(columns=['role'])
)

# Attach the receiving teacher's/administrator's role id by name
msgs_parent_sent_wsender_receiver_id = (
    msgs_parent_sent_wsender_id
    .merge(receiver_id_lookup, how='left', on='receiver_full_name')
    .rename(columns={'role_id': 'receiver_role_id'})
)

# Displays True when the second left merge also kept the row count unchanged
len(msgs_parent_sent_wsender_receiver_id) == len(msgs_parent_sent)

## 1.4. Rowbind data together

In [None]:
# Row-bind school-sent and parent-sent messages into one table with a fresh
# 0..n-1 index (pd.concat instead of DataFrame.append, removed in pandas 2.0)
msgs_wrole_id = pd.concat([msgs_school_sent_wrole_wparent,
                           msgs_parent_sent_wsender_receiver_id],
                          ignore_index = True)

In [None]:
#msgs_wrole_id.to_pickle('../data/analysis_data/translated_msgs_wrole_ids_1214.pkl')
#msgs_wrole_id.to_pickle('../data/analysis_data/translated_msgs_wrole_ids_1221.pkl')

# The row-bound table should have exactly as many rows as the input data
same_row_count = len(msgs_wrole_id) == len(msg_data)
print('Proceed' if same_row_count else 'Something is wrong. Double check')

# 2. Create text chain ids

In [None]:
# Build one id per sender/receiver pair, always written with the non-parent
# (teacher/admin) role id first, whichever side sent the message
sender_is_parent = msgs_wrole_id.sender_role_id.str.startswith('parent_')
chain_if_parent_sent = msgs_wrole_id.receiver_role_id + ':' + msgs_wrole_id.sender_role_id
chain_if_school_sent = msgs_wrole_id.sender_role_id + ':' + msgs_wrole_id.receiver_role_id

msgs_wrole_id['text_chain_ids'] = np.where(sender_is_parent,
                                           chain_if_parent_sent,
                                           chain_if_school_sent)

# Peek at the first few distinct chain ids
msgs_wrole_id[['text_chain_ids']].drop_duplicates().head()

# 3. Fix date and time formatting

In [None]:
# Load the file that carries the corrected dates
df_correct_dates_init = pd.read_pickle(
    '../data/analysis_data/full_year_msg_data_pickle_MODIFIED_01282021.pkl'
)

# Keep the merge keys, the date and the file source; rename the date so it
# does not collide with the existing date_dt column
cols_for_date_fix = ['id', 'content', 'date_dt', 'file_source']
df_correct_dates = (
    df_correct_dates_init[cols_for_date_fix]
    .copy()
    .rename(columns={'date_dt': 'date_dt_corrected'})
)

In [None]:
# Display the dimensions of both tables before merging in the corrected dates
msgs_wrole_id.shape
df_correct_dates.shape

In [None]:
# Left-join the corrected dates on message id and content; the indicator
# column records whether each row found a match
msgs_wrole_id = msgs_wrole_id.merge(df_correct_dates,
                                    how='left',
                                    on=['id', 'content'],
                                    indicator=True)

# Display the new dimensions (compare with the shapes shown before the merge)
msgs_wrole_id.shape

# Display how many rows matched ('both') versus not ('left_only')
msgs_wrole_id['_merge'].value_counts()


# The indicator was only needed for the check above
msgs_wrole_id = msgs_wrole_id.drop(columns='_merge')

In [None]:
# Show the rows whose original date differs from the corrected one
dates_differ = msgs_wrole_id.date_dt != msgs_wrole_id.date_dt_corrected
msgs_wrole_id.loc[dates_differ,
                  ['date', 'time', 'date_dt', 'date_dt_corrected', 'file_source']]

In [None]:
# Which columns do we need to fix?
# Lists every column whose name begins with 'date'
msgs_wrole_id.columns[msgs_wrole_id.columns.str.startswith('date')]

In [None]:
# Overwrite the original date with the corrected one
msgs_wrole_id['date_dt'] = msgs_wrole_id['date_dt_corrected']

# Rebuild the combined timestamp from the corrected date and the time column,
# both rendered as text and joined by a space before parsing
date_time_text = (msgs_wrole_id['date_dt_corrected'].astype(str)
                  + ' '
                  + msgs_wrole_id['time'].astype(str))
msgs_wrole_id['date_time'] = pd.to_datetime(date_time_text)

# round down to whole minutes for grouping
msgs_wrole_id['date_time_minutes'] = msgs_wrole_id['date_time'].dt.floor('Min')

# 4. Manually correct incorrect translations

In [None]:
# Ids of messages translated as 'Greasy' or whose original text is 'Grasias'
translated_as_greasy = msgs_wrole_id.content_w_translation == 'Greasy'
original_is_grasias = msgs_wrole_id.content == 'Grasias'
manual_correction = msgs_wrole_id.loc[translated_as_greasy | original_is_grasias,
                                      'id'].to_list()
len(manual_correction)

# Inspect the flagged rows before changing anything
msgs_wrole_id.loc[msgs_wrole_id.id.isin(manual_correction),
                  ['content', 'content_w_translation', 'detectedSourceLanguage']]

In [None]:
# Replace the translation with 'Thank you' for the flagged message ids only
needs_correction = msgs_wrole_id.id.isin(manual_correction)
msgs_wrole_id['content_w_translation'] = np.where(needs_correction,
                                                  'Thank you',
                                                  msgs_wrole_id.content_w_translation)

translation_cols = ['content', 'content_w_translation', 'detectedSourceLanguage']

# Random sample of corrected rows
msgs_wrole_id.loc[needs_correction, translation_cols].sample(n = 10)


# Check that others haven't been messed up
msgs_wrole_id.loc[~needs_correction, translation_cols].sample(n=15)

# 4. Create a replied to column

In [None]:
# Order messages by student, then chain, then timestamp, and renumber the rows
sort_keys = ['StudentID', 'text_chain_ids', 'date_time']
msgs_sorted = msgs_wrole_id.sort_values(by=sort_keys).reset_index(drop=True)

# Within each chain, pull the previous row's translation, id and timestamp
# down one row; the first message of a chain gets missing values
previous_msg = (
    msgs_sorted
    .groupby('text_chain_ids')[['content_w_translation', 'id', 'date_time']]
    .shift(1)
)
msgs_sorted[['replied_to_content', 'replied_to_msg_id', 'replied_to_msg_date_time']] = \
    previous_msg

# Time elapsed since the previous message in the same chain
msgs_sorted['time_diff'] = msgs_sorted['date_time'] - msgs_sorted['replied_to_msg_date_time']

#msgs_sorted_content_only = msgs_sorted[['text_chain_ids', 'sender_role_id', 'receiver_role_id', 
#                                        'date_time', 'id', 'content_w_translation', 'broad_type']]\
#                          .drop_duplicates()


In [None]:
# Columns shown when eyeballing a single text chain
relevant_cols_to_view = [
    # who is talking to whom
    'text_chain_ids',
    'sender_role_id',
    'receiver_role_id',
    # when, and which message
    'date',
    'date_time',
    'id',
    'content_w_translation',
    # the message it follows in the chain
    'replied_to_content',
    'replied_to_msg_id',
    'replied_to_msg_date_time',
    'time_diff',
]

In [None]:
# Draw one chain id reproducibly (seed 35) and show that chain's messages
sample = (msgs_sorted.text_chain_ids
          .drop_duplicates()
          .sample(n=1, random_state=35)
          .values)
msgs_sorted.loc[msgs_sorted.text_chain_ids == sample[0], relevant_cols_to_view]

In [None]:
# Draw another chain id reproducibly (seed 89)
sample = (msgs_sorted.text_chain_ids
          .drop_duplicates()
          .sample(n=1, random_state=89)
          .values)

# Raise the display limit above the table's row count so nothing is truncated
pd.set_option('display.max_rows', len(msgs_sorted) + 1)
msgs_sorted.loc[msgs_sorted.text_chain_ids == sample[0], relevant_cols_to_view]

#'teacher_id_4:parent_id_2035'

In [None]:
# One row per chain holding the first id in the sorted order, as root_id
first_msg_in_chain = (
    msgs_sorted
    .groupby('text_chain_ids')[['id']]
    .first()
    .reset_index()
    .rename(columns={'id': 'root_id'})
)

# Attach each chain's root id to every message of that chain
msgs_sorted_w_root = msgs_sorted.merge(first_msg_in_chain,
                                       how='left',
                                       on='text_chain_ids')

In [None]:
#msgs_sorted_w_root.head()
# check to see this looks right: same sampled chain, now with its root id
cols_with_root = relevant_cols_to_view + ['root_id']
msgs_sorted_w_root.loc[msgs_sorted_w_root.text_chain_ids == sample[0], cols_with_root]


In [None]:
# Use None instead of NaN where a message has no preceding message id
has_no_previous = msgs_sorted_w_root.replied_to_msg_id.isna()
msgs_sorted_w_root['replied_to_msg_id'] = np.where(has_no_previous,
                                                   None,
                                                   msgs_sorted_w_root.replied_to_msg_id)

In [None]:
# Display the dimensions of msgs_sorted_w_root
msgs_sorted_w_root.shape

In [None]:
# one last date check: earliest and latest date_dt per school, side by side
dates_by_school = msgs_sorted_w_root.groupby('school_merge')[['date_dt']]
earliest_by_school = dates_by_school.min().reset_index()
latest_by_school = dates_by_school.max().reset_index()

# The merge suffixes the two date_dt columns with _x / _y; give them clear names
earliest_by_school.merge(latest_by_school, how='inner', on='school_merge')\
    .rename(columns={'date_dt_x': 'min_date',
                     'date_dt_y': 'max_date'})

# 5. Write as pickle for use in Script #09

In [None]:
# Today's date as an 8-digit MMDDYYYY stamp for the output filenames
todaysdate = '{:%m%d%Y}'.format(date.today())
todaysdate

In [None]:
# Dated pickle path: fixed prefix + date stamp + extension
write_filename = ''.join(['../data/analysis_data/translated_msgs_wrole_ids_',
                          todaysdate,
                          '.pkl'])
write_filename

In [None]:
# Save the sorted messages (with root ids) to the dated pickle path.
# NOTE: this runs before the line-break cleanup in the following cell, so the
# pickle keeps the text columns as they are at this point.
msgs_sorted_w_root.to_pickle(write_filename)

In [None]:
# Add all content columns for cleaning (strip escape chars): every column whose
# name starts with 'content', except the numeric 'content_len', plus the
# reply and translation text columns.
# Filtering (rather than list.remove) avoids a ValueError when 'content_len'
# is not among the columns.
content_columns = [col_name for col_name in
                   msgs_sorted_w_root.columns[msgs_sorted_w_root.columns.str.startswith('content')].to_list()
                   if col_name != 'content_len']
content_columns.extend(['replied_to_content', 'translatedText', 'translatedText_rm_html'])

content_columns

# replace: remove every carriage return and line feed.
# regex=True is explicit because pandas >= 2.0 defaults to regex=False, under
# which the old pattern was matched as literal text and removed nothing.
for col in content_columns:
    msgs_sorted_w_root[col] = msgs_sorted_w_root[col].str.replace(r'\r|\n', '', regex=True)

# 6. Clean and write as csv for use in conference analyses (Scripts #10+)

In [None]:
# Dated csv path: fixed prefix + MMDDYYYY stamp + extension
todaysdate = '{:%m%d%Y}'.format(date.today())
write_filename = ''.join(['../data/analysis_data/translated_msgs_wrole_ids_',
                          todaysdate,
                          '.csv'])
write_filename

In [None]:
# write as csv for R script (row index is not written)
msgs_sorted_w_root.to_csv(write_filename, index = False)

# Check same N rows: read the file back and display its dimensions to compare
# with the in-memory table
msg_data_check = pd.read_csv(write_filename)
msg_data_check.shape