# 0. Imports and functions

In [None]:
## data visualization
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np
import yaml
import re
import nltk
from nltk import word_tokenize
import datetime
from datetime import datetime


#pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import os
from functools import reduce
from collections import OrderedDict 


import plotnine
from plotnine import *

standard_background = theme(panel_background = element_blank(),   
       panel_grid_major_y = element_blank(),
       axis_text_x = element_text(color = "black", hjust = 1, size = 24),
       axis_text_y = element_text(color = "black", size = 24),
       legend_text = element_text(color = 'black', size = 24),
       legend_title = element_text(color = 'black', size = 24),
       axis_title=element_text(size=24),
       strip_text_x = element_text(size = 12),
       legend_background = element_blank(),
       legend_key = element_blank(),
       panel_grid_major = element_blank(), 
       panel_grid_minor = element_blank(),
       axis_ticks=element_blank())

control_color = "#444444"
treatment_color = "#2B4888"

## cleaner school names
schoolmap_msg = {'Anacostia HS': "Anacostia", 
                 'Columbia Heights EC (CHEC)': "CHEC",
                 'Dunbar HS': 'Dunbar'}

In [None]:

id_crosswalk = pd.read_csv("../data/analysis_data/DCPS Student-Parent Crosswalk.csv")

# 1. Aggregate messaging data from term 1 and term 2, for both first semester and second semester schools

## 1.1. Term 1 messages for first semester schools

### 1.1.1. Load messages

In [None]:
## load outgoing 
msg_data_outgoing = pd.read_csv("../data/analysis_data/DCPS_Outbound_Messages_for_DME_Award_Study.csv") 
msg_data_outgoing.shape

In [None]:
## load incoming messages (parental response)
parent_response_df = pd.read_csv("../data/analysis_data/DCPS_Messages_Incoming_from_Parents.csv")


# for some reason the date/time columns were not named that way
parent_response_df.rename(columns = {'to_char': 'date',
                                     'to_char.1': 'time'}, inplace = True)


### 1.1.2. Prepare messages for concatenating the outgoing and the incoming parent messages

In [None]:
same_cols = list(set(msg_data_outgoing.columns).intersection(parent_response_df.columns))
outgoing_notinc = list(set(msg_data_outgoing.columns).difference(parent_response_df.columns))
inc_notoutgoing = list(set(parent_response_df.columns).difference(msg_data_outgoing.columns))

same_cols
outgoing_notinc
inc_notoutgoing

## dtypes also appear to be similar (eg not yet datetime format, so rowbind)
msg_data_outgoing['broad_type'] = "school_sent"
parent_response_df['broad_type'] = "parent_sent"

# file source
msg_data_outgoing['file_source'] = 'first_sem_schools_outgoing_term1_msgs'
parent_response_df['file_source'] = 'first_sem_schools_incoming_term1_msgs'

msg_data_init = pd.concat([msg_data_outgoing,
                     parent_response_df])

In [None]:
msg_data_outgoing.shape[0] + parent_response_df.shape[0]
msg_data_init.shape

### 1.1.3. Clean up messages for blanks and invalid messages 

In [None]:
# Remove blank messages (rows whose `content` is missing)
msg_data_rm_blank = msg_data_init[msg_data_init.content.notna()].copy()
msg_data_rm_blank.shape


# We know that this one particular chain is between a student and a school admin, which we do not want
# in our sample. Remove!
# The student ID to exclude is read from the MESSAGE_IDEXCLUDE environment variable
# (a KeyError is raised if that variable is not set).
# Each comparison sits in its own parentheses: `&` binds tighter than `==`, so the
# ID comparison must be closed before it is combined with the relationship test.
# NOTE(review): os.environ values are strings; if 'Student ID' was read in as a numeric
# column this comparison never matches -- confirm the dtype.
is_excluded_chain = ((msg_data_rm_blank['Student ID'] == os.environ['MESSAGE_IDEXCLUDE']) &
                     (msg_data_rm_blank.relationship == 'Unspecified'))

msg_data_rm_blank_student = msg_data_rm_blank.loc[~is_excluded_chain].copy()
msg_data_rm_blank_student.shape

### 1.1.4. Clean up dates

In [None]:
# Term 1 outgoing and incoming messages are day-first
msg_data_rm_blank_student['date_dt'] = pd.to_datetime(msg_data_rm_blank_student.date, 
                                                      dayfirst = True, 
                                                      errors = 'coerce')

msg_data_rm_blank_student.sample(n = 5)[['date', 'date_dt']]


## 1.2. Term 2 Messages for First Semester Schools

### 1.2.1. Load messaging data

In [None]:
# Term 2 outgoing messaging data for first semester schools
term2_remain_outgoing_init = pd.read_excel('../data/analysis_data/DME Messages Term 2 DCPS Pilot Schools Term 1.xlsx',
                                          sheet_name="Sent Messages")
term2_remain_outgoing_init.shape

# Parental response
term2_remain_incoming_init = pd.read_excel('../data/analysis_data/DME Messages Term 2 DCPS Pilot Schools Term 1.xlsx',
                                          sheet_name="Replies")
term2_remain_incoming_init.shape


# file source
term2_remain_outgoing_init['file_source'] = 'first_sem_schools_outgoing_term2_msgs'
term2_remain_incoming_init['file_source'] = 'first_sem_schools_incoming_term2_msgs'

### 1.2.2. Clean up columns in both incoming and outgoing messages for concatenating

In [None]:
term2_remain_outgoing_init.info()

term2_remain_incoming_init.info()

In [None]:
## Which columns don't line up?
same_cols = list(set(term2_remain_outgoing_init.columns).intersection(term2_remain_incoming_init.columns))
outgoing_notinc = list(set(term2_remain_outgoing_init.columns).difference(term2_remain_incoming_init.columns))
inc_notoutgoing = list(set(term2_remain_incoming_init.columns).difference(term2_remain_outgoing_init.columns))

print("Same columns:")
same_cols

print("Cols in outgoing but not incoming:")
outgoing_notinc

print("Cols in incoming but not outgoing:")
inc_notoutgoing

In [None]:
# Looks like the identifier column in incoming messages is referring to SIS ID
term2_remain_incoming_init[term2_remain_incoming_init.identifier.isin(id_crosswalk.SIS_ID)].shape

# Outgoing messages: studentdistrictid = SIS_ID 
term2_remain_outgoing_init[term2_remain_outgoing_init.studentdistrictid.isin(id_crosswalk.SIS_ID)].shape

# Not sure what studentinternal ID is 
term2_remain_outgoing_init[term2_remain_outgoing_init.studentinternalid.isin(id_crosswalk.STATE_ID)].shape

In [None]:
# Rename columns 
# studentdistrictid = SIS_ID 

# For parent messages
term2_remain_incoming_init.rename(columns = {'to_char': 'date',
                                             'to_char.1': 'time', 
                                             'identifier': 'Student ID'}, 
                                  inplace = True)

# For outgoing teacher messages
term2_remain_outgoing_init.rename(columns = {'studentdistrictid': 'Student ID', 
                                             'date2': 'date' },
                                  inplace = True)



# Remove columns 
# date2 is the same as date1, just with day first - i'm keeping this column and 
# removing date1 since the other data are day first as well 

# we don't know what the id or the studentinternalid columns are; they didn't exist in the first send

term2_remain_outgoing_init.drop(columns=['date1', 
                                         'id', 
                                         'studentinternalid'], 
                                axis=1, 
                                inplace = True)
term2_remain_outgoing_init.shape

term2_remain_incoming_init.drop(columns=['id', 'sis_id'], axis = 1, inplace = True)
term2_remain_incoming_init.columns

In [None]:
same_cols = list(set(term2_remain_outgoing_init.columns).intersection(term2_remain_incoming_init.columns))

len(same_cols)

In [None]:
## Add broad_type column that was included in the original msging data
term2_remain_outgoing_init['broad_type'] = "school_sent"
term2_remain_incoming_init['broad_type'] = "parent_sent"

In [None]:
# Remove rows with blank messages

# Filter out messages that are empty
print('N outgoing messages w empty content:', 
       term2_remain_outgoing_init[term2_remain_outgoing_init.content.isna()].shape[0])

term2_remain_outgoing = term2_remain_outgoing_init[~term2_remain_outgoing_init.content.isna()].copy()
print('N messages remaining', term2_remain_outgoing.shape[0])



print('N incoming messages w empty content:', 
      term2_remain_incoming_init[term2_remain_incoming_init.content.isna()].shape[0])
term2_remain_incoming = term2_remain_incoming_init[~term2_remain_incoming_init.content.isna()].copy()
print('N messages remaining', term2_remain_incoming.shape[0])

### 1.2.3. Concatenate outgoing and incoming term 2 messages for first semester schools

In [None]:
# Concat messages
term2_remain_msg_data = pd.concat([term2_remain_outgoing,
                                   term2_remain_incoming])
term2_remain_msg_data.shape


term2_remain_outgoing.shape[0] + term2_remain_incoming.shape[0] == term2_remain_msg_data.shape[0]

# remove sis_id
term2_remain_msg_data.drop(columns = ['sis_id'], axis = 1, inplace = True)

### 1.2.4. Clean up dates – they are month first this time

In [None]:
# Term 2 outgoing and incoming messages are month first
term2_remain_msg_data['date_dt'] = pd.to_datetime(term2_remain_msg_data.date, 
                                                      dayfirst = False)

term2_remain_msg_data.sample(n = 5)[['date', 'date_dt']]

In [None]:
term2_remain_msg_data[['date', 'date_dt']].tail()

In [None]:
max(term2_remain_msg_data.date_dt)

### 1.2.5. Check N columns

In [None]:
# Check if we have the same columns as the term 1 messages for first semester schools
same_cols = list(set(msg_data_rm_blank_student.columns).intersection(term2_remain_msg_data.columns))

len(same_cols)
len(msg_data_rm_blank_student.columns)
len(term2_remain_msg_data.columns)

## 1.3. Term 2 Messages for Second Semester Schools

### 1.3.1. Load data

In [None]:
# Read in term 2 data; these are the second semester schools
term2_init = pd.read_excel("../data/analysis_data/Term 2 Messaging Data DME Award.xlsx") 
term2_init.shape 

same_cols = list(set(term2_init.columns).intersection(term2_remain_msg_data.columns))
len(same_cols)

print("Same N cols as term 2 schools?", len(same_cols)==term2_remain_msg_data.shape[1])



In [None]:
term2_init.info()

### 1.3.2. Clean up columns

In [None]:
# We know that there is some weird shifting of columns going on

# For those where the "Unnamed: 19" column is empty, the text msgs are in the correct column
term2_correct_content = term2_init[term2_init['Unnamed: 19'].isna()].copy()

# Text messages in the wrong column
term2_incorrect_content = term2_init[~term2_init['Unnamed: 19'].isna()].copy()

term2_correct_content.shape[0] + term2_incorrect_content.shape[0] == term2_init.shape[0]

In [None]:
# Correct
term2_correct_content.shape
term2_correct_content.name.unique()
#term2_correct_content.head()

# Incorrect
term2_incorrect_content.shape
term2_incorrect_content.name.unique()
#term2_incorrect_content.head()

In [None]:
# Create a column name for the bad column
# The header of the shifted rows is realigned by inserting a placeholder name ('bad_column')
# immediately before 'student_name' and dropping the final header, so every header from
# 'student_name' onward moves one position to the right.


# Get index of the student name column
x = term2_incorrect_content.columns.get_loc('student_name')

# Get the names of the columns beginning 'student_name' and after 
colnames = list(term2_incorrect_content.columns[x:])

# Insert placeholder column name at the beginning of that list
colnames.insert(0, 'bad_column')

# Remove the last column name from the list
# (assumes "Unnamed: 19" is the final column of the sheet -- TODO confirm)
colnames.pop(-1)

# Create list of new column names: untouched headers before 'student_name' + shifted headers
new_colnames = list(term2_incorrect_content.columns[:x]) + colnames
new_colnames


# Replace column names if N cols match
# (one name was inserted and one removed above, so the counts are equal by construction)
if len(new_colnames) == term2_incorrect_content.shape[1]:
    term2_incorrect_content.columns = new_colnames
    
#Check
# term2_incorrect_content.head()

In [None]:
# To send to teachertext for verification
term2_incorrect_content.to_csv('../data/share_teachertext/teachertext_to_check_bad_column.csv', index = False)

### 1.3.3. Before stitching df with bad columns back to the original, clean up the dates. 

In [None]:
# For "correct" content, it's month first
term2_correct_content['date_dt'] = pd.to_datetime(term2_correct_content.to_char, 
                                                   dayfirst=False, 
                                                   errors = 'coerce')

term2_correct_content.sample(n = 8)[['to_char', 'date_dt']]

term2_correct_content['file_source'] = 'second_sem_correct_cols'

In [None]:
# For "incorrect" content, it's day first
# But there is some weird formatting going on, so a more manual edit is needed

# Create a fake id column (id_1, id_2, ...) for merging the corrected dates back later on
term2_incorrect_content['dt_index'] = ['id_' + str(i) for i in np.arange(1, 
                                                                         term2_incorrect_content.shape[0]+1)\
                                       .tolist()]

# Grab the rows whose `to_char`, rendered as a string, starts with a year (2019 or 2020)
manual_date_edit_t2 = term2_incorrect_content[(term2_incorrect_content.to_char.apply(str).str.startswith('2020')|
                                              term2_incorrect_content.to_char.apply(str).str.startswith('2019'))]\
                      .copy()

manual_date_edit_t2.shape

# Apply a strptime with format year-day-month: the middle field is read as the day
# and the last date field as the month
corrected_dates = [pd.to_datetime(datetime.strptime(date, '%Y-%d-%m %H:%M:%S'))\
                   for date in manual_date_edit_t2.to_char.apply(str).to_list()]

# Sanity check only: this prints on success but does not stop execution on a mismatch
if len(corrected_dates) == manual_date_edit_t2.shape[0]:
    print('Proceed') 

manual_date_edit_t2['date_dt'] = corrected_dates
    
# Merge back to dataset; rows that were not manually corrected get a missing date_dt
term2_incorrect_content_wdates = pd.merge(term2_incorrect_content,
                                          manual_date_edit_t2[['dt_index', 'date_dt']], 
                                          how = 'left', 
                                          on = 'dt_index')

In [None]:
# For rows still missing date_dt, fall back to a regular pandas datetime conversion (day first)
# Both branches are computed for every row; np.where then picks one per row.
# NOTE(review): the fallback parses `to_char` from term2_incorrect_content (the pre-merge
# frame) and np.where combines its inputs by position, so this presumably relies on the
# left merge having kept the same rows in the same order -- confirm.
term2_incorrect_content_wdates['date_dt'] = np.where(term2_incorrect_content_wdates.date_dt.isna(), 
                                                     pd.to_datetime(term2_incorrect_content.to_char, 
                                                                    dayfirst=True), 
                                                     term2_incorrect_content_wdates.date_dt)

# Check the max
term2_incorrect_content_wdates.date_dt.max()

# Spot-check a random sample of raw vs. parsed dates
term2_incorrect_content_wdates.sample(n = 8)[['to_char', 'date_dt']]

# Tag the rows with their origin
term2_incorrect_content_wdates['file_source'] = 'second_sem_incorrect_cols'

# The helper id was only needed for the merge above
term2_incorrect_content_wdates.drop(columns = ['dt_index'], inplace = True)

### 1.3.4. Stitch back corrected columns together for term 2 schools

In [None]:
# Remove columns we don't need
term2_correct_content.drop(columns=['Unnamed: 19'], inplace = True)
term2_incorrect_content_wdates.drop(columns=['bad_column'], inplace = True)

# Concat the original correct content, and the corrected incorrect content
term2_school_msgs = pd.concat([term2_correct_content, 
                              term2_incorrect_content_wdates])

print('Does the cleaned up version have the same number of rows as the version read in? If True, proceed.', 
      term2_school_msgs.shape[0] == term2_init.shape[0])

# Filter out blank messages
term2_school_msgs_rm_blanks = term2_school_msgs[~term2_school_msgs.content.isna()].copy()
print('N rows before removing blanks', term2_school_msgs.shape, 
      '\nN rows after blanks: ',term2_school_msgs_rm_blanks.shape)

term2_school_msgs_rm_blanks.head()

### 1.3.5. Make columns consistent with the first semester schools' dataframes

In [None]:
term2_school_msgs_rm_blanks.role.unique()

# Add the broad type col
term2_school_msgs_rm_blanks['broad_type'] = np.where(term2_school_msgs_rm_blanks.role == 'Parent', 
                                           'parent_sent', 
                                           'school_sent')

# Rename col
term2_school_msgs_rm_blanks.rename(columns = {'to_char': 'date',
                                             'to_char.1': 'time', 
                                             'identifier': 'Student ID'}, inplace = True)

term2_school_msgs_rm_blanks.drop(columns=['id', 'sis_id'], 
                                 axis = 1, 
                                 inplace = True) 

In [None]:
# Check again to see if we have any missing cols
same_cols = list(set(term2_school_msgs_rm_blanks.columns).intersection(term2_remain_msg_data.columns))
len(same_cols)

outgoing_notinc = list(set(term2_school_msgs_rm_blanks.columns).difference(term2_remain_msg_data.columns))
inc_notoutgoing = list(set(term2_remain_msg_data.columns).difference(term2_school_msgs_rm_blanks.columns))

same_cols
outgoing_notinc
inc_notoutgoing


In [None]:
term2_all_msgs = pd.concat([term2_school_msgs_rm_blanks, 
                            term2_remain_msg_data])
term2_all_msgs.shape

In [None]:
term2_remain_msg_data.date_dt.max()

## 1.4. Put Term 1 and Term 2 messaging data for all schools together

In [None]:
# Check again to see if we have any missing cols
same_cols = list(set(msg_data_rm_blank_student.columns).intersection(term2_all_msgs.columns))
len(same_cols)

term1_not_term2 = list(set(msg_data_rm_blank_student.columns).difference(term2_all_msgs.columns))
term2_not_term1 = list(set(term2_all_msgs.columns).difference(msg_data_rm_blank_student.columns))

same_cols
term1_not_term2
term2_not_term1


In [None]:
# Put Term 1 and Term 2 messages together
msg_data = pd.concat([msg_data_rm_blank_student,
                      term2_all_msgs]).drop_duplicates()

msg_data.shape

# 2. Clean different variables and create basic message features

In [None]:
## Convert dates to the appropriate format
# Combine the parsed calendar date (date_dt) with the raw time column into a single timestamp
msg_data['date_time'] = \
 pd.to_datetime(msg_data['date_dt'].astype(str) + ' ' + msg_data.time.astype(str))

# Harmonize label variants. The result is assigned back to the column rather than calling
# .replace(..., inplace=True) on msg_data['role']: the in-place form operates on an
# intermediate Series and does not update msg_data when pandas Copy-on-Write is active.
msg_data['role'] = msg_data['role'].replace({'Admin': 'Administrator'})
msg_data['relationship'] = msg_data['relationship'].replace({'parent':'Parent'})

# Upper-case text versions used for all name matching below
# (astype(str) also turns missing values into text)
msg_data['content_upper'] = msg_data.content.astype(str).str.upper()
msg_data['student_upper'] = msg_data.student_name.astype(str).str.upper()

#tokenize student names (split on single spaces)
msg_data['student_tokens'] = msg_data.student_upper.str.split(' ')

# Some students can have two first names, e.g. Ann Marie Smith
# This should take care of most of the cases

# If the name has more than 2 tokens, join the first 2 tokens; otherwise take just the first token
msg_data['student_2_name'] = msg_data['student_tokens'].apply(
    lambda tokens: ' '.join(tokens[:2] if len(tokens) > 2 else tokens[:1]))

## one version of student firstname 
## is everything before first space (could be weird if multiple names)
msg_data['student_firstname_spaceversion'] = msg_data.student_tokens.str[0]
name_var = 'student_firstname_spaceversion' # in case we change

## last names: the final space-separated token of each upper-cased name
msg_data['student_last_name'] = msg_data.student_tokens.str[-1]
msg_data['receiver_last_name'] = msg_data.receiver_full_name.str.upper().str.split(' ').str[-1]
msg_data['sender_lname'] =  msg_data.sender_full_name.str.upper().str.split(' ').str[-1]
msg_data['receiver_fullname_upper'] = msg_data.receiver_full_name.str.upper()

msg_data.shape

## 2.1. Different versions of messages

These transformations are only really relevant for the outgoing messages, but they are applied to incoming messages as well for consistency.

The ID step may also help distinguish stock parent responses (e.g., "Thank you!") from more in-depth responses.

In [None]:
def create_possessive(name_var):
    '''
    Build the possessive form of a name.

    Returns a 2-tuple ordered (possessive, original): the name with "'S"
    appended, followed by the name exactly as it was passed in.
    '''
    return name_var + "'S", name_var


def remove_names_alternate(one_row):
    '''
    Strip the student's first name(s) from one message while keeping the sender's name.

    Reads four fields from `one_row` (a single row of the messaging data):
    'student_2_name', 'student_firstname_spaceversion', 'sender_lname' and 'content_upper'.
    The possessive form (NAME'S) and the bare form of the student's name are deleted from
    'content_upper': the two-name version first, then the single first name.

    If the message contains the sender's last name preceded by a title (e.g. "MS. JOSEPH"),
    the first such occurrence is left untouched and student names are only removed from the
    text before and after it.
    (This accounts for cases like "Jose" as student fname and Ms. Joseph as teacher lname.)

    Names and titles are matched literally: regex metacharacters are escaped, so the "."
    in "MR." only matches a period and a name containing characters such as "(" or "."
    can neither break the pattern nor match unintended text. A missing or empty sender
    last name is treated as "sender name not present in the message".

    Returns the cleaned message with every run of whitespace collapsed to one space.
    '''
    message = one_row.loc['content_upper']

    def name_pattern(name):
        # Possessive form first so "JOSE'S" is removed whole instead of leaving "'S" behind
        return re.compile('|'.join(re.escape(form) for form in (name + "'S", name)))

    two_name_pattern = name_pattern(one_row.loc['student_2_name'])
    single_name_pattern = name_pattern(one_row.loc['student_firstname_spaceversion'])

    def strip_student_names(text):
        # Two-name version first, then the single first name
        return single_name_pattern.sub('', two_name_pattern.sub('', text))

    sender_lname = one_row.loc['sender_lname']
    titles = ['MR.', 'MRS.', 'MS.', 'MR', 'MRS', 'MS', 'SR', 'SRA', 'SR.', 'SRA.'] # add a couple but dk if they're found

    # Look for "<title> <sender last name>" in the message; skipped when the last name
    # is missing (e.g. NaN), which cannot be concatenated with a title
    sender_name_search = None
    if isinstance(sender_lname, str) and sender_lname:
        sender_pattern = '|'.join(re.escape(title + ' ' + sender_lname) for title in titles)
        sender_name_search = re.search(sender_pattern, message)

    if sender_name_search is None:
        # Sender name not in the message: remove student names everywhere
        msg_noname = strip_student_names(message)
    else:
        # Clean the text on either side of the sender's name and put the matched
        # sender name (.group()) back between the two halves unchanged
        msg_noname = (strip_student_names(message[:sender_name_search.start()])
                      + sender_name_search.group()
                      + strip_student_names(message[sender_name_search.end():]))

    # Rm extra whitespace that might exist as a result
    return re.sub(r'\s+', ' ', msg_noname)


def create_newids(variable_forid: str, df: pd.DataFrame):
    '''
    Attach an "id_<variable_forid>" column that labels each distinct value of a column.

    Distinct values of `variable_forid` are numbered id_1, id_2, ... in order of first
    appearance, and that label is left-joined back onto every row of `df`.
    If the id column already exists, `df` is returned as is.
    '''
    new_name = "id_" + variable_forid

    # Guard clause: nothing to do when the id column is already present
    if new_name in df.columns:
        print("Already created; skip")
        return df

    print("Haven't yet created; create")

    # One row per distinct value of the column
    lookup = df[[variable_forid]].drop_duplicates()

    # Number the distinct values sequentially
    lookup[new_name] = [f"id_{position}" for position in range(1, len(lookup) + 1)]

    # Left join so every original row picks up the id of its value
    return df.merge(lookup, on = variable_forid, how = "left")
    
    
def create_newids_wgrouping(variable_forid: str, 
                            grouping_vars_list: list,
                  df: pd.DataFrame):
    ''' 
    Give each unique (grouping variables + message) combination a unique ID.

    Adds a column named "id_<variable_forid>_wgroup" to a copy of `df`: distinct
    combinations of the columns in `grouping_vars_list` plus `variable_forid` are
    numbered id_1, id_2, ... in order of first appearance and left-joined back
    onto every row. If that column already exists, `df` is returned as is.

    `grouping_vars_list` is not modified.
    '''
    ## first check if the id column already exists
    new_name = "id_" + variable_forid + '_wgroup'
    
    if new_name in df.columns:
        print("Already created; skip")
        return(df)

    # Build a new key list instead of appending to the caller's list, so a list that is
    # reused across calls does not silently accumulate extra columns
    key_cols = list(grouping_vars_list) + [variable_forid]

    ## subset to the key columns and dedup
    df_dedup = df[key_cols].drop_duplicates()

    ## create id col
    df_dedup[new_name] = ["id_" + str(i) for i in range(1, df_dedup.shape[0] + 1)]

    ## left join with original df and return
    df_return = pd.merge(df, df_dedup, on = key_cols, how = "left")

    return(df_return)
    
    
def remove_receiver_name_withtitle(one_row):
    '''
    Remove the receiver's (either parent or teacher) name from one message.

    Reads 'content_nostudentname', 'receiver_last_name' and 'receiver_fullname_upper'
    from `one_row`. The message is searched for "<title> <receiver last name>"
    (MR., MRS., MS., SR., SRA. with or without the period) or the receiver's full name.
    Every occurrence of the first form found is deleted and whitespace runs are collapsed
    to a single space. If no form is found the message is returned unchanged.

    Names are matched literally (regex metacharacters are escaped), and a missing or
    empty receiver name is skipped instead of raising.
    '''
    
    one_message = one_row.loc['content_nostudentname']
    receiver_lname = one_row.loc['receiver_last_name']
    receiver_fullname = one_row.loc['receiver_fullname_upper']
    titles = ['MR.', 'MRS.', 'MS.', 'MR', 'MRS', 'MS', 'SR', 'SRA', 'SR.', 'SRA.']

    # Candidate spellings of the receiver's name; missing values (e.g. NaN) are left out
    list_receiver_names = []
    if isinstance(receiver_lname, str) and receiver_lname:
        list_receiver_names.extend(title + ' ' + receiver_lname for title in titles)
    if isinstance(receiver_fullname, str) and receiver_fullname:
        list_receiver_names.append(receiver_fullname)

    if not list_receiver_names:
        return(one_message)

    receiver_name_search = re.search('|'.join(re.escape(name) for name in list_receiver_names),
                                     one_message)
    if receiver_name_search is None: 
        return(one_message)

    # Plain string replacement: the matched text is removed literally, not re-used as a regex
    msg_norecname = one_message.replace(receiver_name_search.group(), "")
    msg_noname_removews = re.sub(r'\s+', ' ', msg_norecname)
    return(msg_noname_removews)

In [None]:
# Remove students' names (using function that preserves teacher names)
msg_data['content_nostudentname'] = msg_data.apply(remove_names_alternate, axis = 1)
msg_data['content_noreceiverstudentname'] = msg_data.apply(remove_receiver_name_withtitle, axis = 1)

In [None]:
if len(msg_data.content_nostudentname.unique()) != len(msg_data.content_noreceiverstudentname.unique()):
    print("did something different to the messages; proceed")
else:
    print("same n unique; check your code again pls")

In [None]:
## Give messages a unique ID
msg_data['id'] = ["id_" + str(i) for i in np.arange(1, msg_data.shape[0]+1).tolist()] #give text messages a fake id


## above id is for each message; create new id that creates an id for each set of duplicated messages 
## (before stud name removal)
content_list = ['content_upper','content_nostudentname', 'content_noreceiverstudentname']

# round to minutes for grouping
msg_data['date_time_minutes'] = msg_data.date_time.dt.floor('Min') 

for colname in content_list:
    # for every content column, 
    # create a new id (drops duplicate msgs)
    msg_data = create_newids(variable_forid=colname,
                             df = msg_data)
    print('N Unique IDs: id_'+ colname, msg_data['id_' + colname].nunique())
    
    # for every content column, 
    # create a new id based on sender and date time in minutes
    msg_data = create_newids_wgrouping(variable_forid = colname,
                                       grouping_vars_list=["sender_full_name", "date_time_minutes"],
                                       df = msg_data)
    
    print('N Unique IDs: id_'+ colname, '_wgroup ', msg_data['id_' + colname + '_wgroup'].nunique(), sep = '')
    
    print()

In [None]:
## for a given message, get count of appearances
# Lookup table: one row per id of the name-stripped message text, with the number of
# rows in msg_data that carry that id
countappearances_message_removename = \
              pd.DataFrame({'message_id': msg_data.id_content_nostudentname\
                                                    .value_counts().index,
                            'count_appearances_message_removename': msg_data.id_content_nostudentname\
                                                    .value_counts()})

# Same lookup table, but keyed on the id of the raw upper-cased message text
countappearances_message_withname = pd.DataFrame({'message_id': msg_data.id_content_upper.value_counts().index,
                            'count_appearances_message': msg_data.id_content_upper.value_counts()})


## left join onto main messaging data
msg_data_tomerge = msg_data.copy()

# First join: adds 'message_id' and 'count_appearances_message_removename'
msg_data_wcounts = pd.merge(msg_data_tomerge, 
                            countappearances_message_removename, 
                            left_on = ['id_content_nostudentname'],
                            right_on = ['message_id'], 
                            how = 'left')

# Second join: both sides now have a 'message_id' column, so pandas suffixes them
# as message_id_x / message_id_y
msg_data_wcounts_more = pd.merge(msg_data_wcounts, 
                                 countappearances_message_withname, 
                                 left_on = ['id_content_upper'],
                                 right_on = ['message_id'], 
                                 how = 'left')
# The join keys are no longer needed once the counts are attached
msg_data_wcounts_more.drop(['message_id_x', "message_id_y"], axis = 1, inplace = True)
# stock_msg = 1 when the name-stripped message id occurs on more than one row, else 0
msg_data_wcounts_more['stock_msg'] = np.where(msg_data_wcounts_more.id_content_nostudentname.duplicated(keep = False),
                                           1, 0)

## reassign
msg_data = msg_data_wcounts_more.copy()

# 3. Create more basic message-level features 

In [None]:
msg_data['n_tokens_rawmsg'] = msg_data.apply(lambda row: len(nltk.word_tokenize(row['content_upper'])), axis=1)

In [None]:
# get string length
msg_data['rawmsg_len'] = msg_data.content_upper.str.len()
msg_data['no_student_len'] = msg_data.content_nostudentname.str.len()
msg_data['no_stud_receiver_len'] = msg_data.content_noreceiverstudentname.str.len()


In [None]:
# Flag rows where sender and receiver share a last name (1 = same, 0 = different)
msg_data['teacher_parent_samelname'] = np.where(msg_data.sender_lname == msg_data.receiver_last_name, 1, 0)
print('Breakdown of parents + teachers with same lname')
msg_data.teacher_parent_samelname.value_counts()

msg_data.rename(columns = {'name': 'school_name'}, inplace = True)

# For each name column, flag (1/0) whether that row's name occurs as a substring of
# that row's upper-cased message
name_list = [name_var, 'receiver_last_name', 'student_last_name']
var_name_list = ['uses_student_name', 'uses_receiver_name', 'uses_student_lname']
for name_col, flag_col in zip(name_list, var_name_list):
    name_in_message = [name in message
                       for name, message in zip(msg_data[name_col], msg_data['content_upper'])]
    msg_data[flag_col] = np.where(name_in_message, 1, 0)
    

# 4. Prepare output files

## 4.1. Clean up

In [None]:
# Rename the student id column so it has no space in its name
msg_data.rename(columns={'Student ID': 'StudentID'}, inplace = True)

# Collapse school_name into a short label by substring match, checked in this order:
# CHEC, Anacostia, Dunbar, Paul, Friendship. Anything that matches none of these is
# labelled 'Johnson'.
# NOTE(review): 'Johnson' is the catch-all, so an unexpected school name would also be
# labelled 'Johnson' -- verify with the table displayed below.
msg_data['school_merge'] = np.where(msg_data.school_name.str.contains("CHEC"),
                                'CHEC',
                           np.where(msg_data.school_name.str.contains("Anacostia"),
                                'Anacostia', 
                           np.where(msg_data.school_name.str.contains("Dunbar"),
                                 'Dunbar',
                           np.where(msg_data.school_name.str.contains("Paul"),
                                 'Paul',
                           np.where(msg_data.school_name.str.contains("Friendship"),
                                 'Friendship',
                           'Johnson'))))) 

# Check if this is done correctly: list each distinct (school_name, school_merge) pair
msg_data[['school_name', 'school_merge']].drop_duplicates()

In [None]:
msg_data.date_dt.max()

In [None]:
msg_data[msg_data.date_dt == msg_data.date_dt.max()][['date']]

## 4.2. Write outputs as pickles

In [None]:
msg_data.to_pickle('../data/analysis_data/full_year_msg_data_pickle_MODIFIED_01282021.pkl')