# Data Loading and Preparation

## Import libraries and load data

In [161]:
import pandas as pd
import numpy as np
from datetime import datetime

pd.set_option('display.max_columns', None) 
pd.set_option('display.max_colwidth', None)

In [162]:
df = pd.read_csv('data/twcs/twcs.csv')
df.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist you. We would need to get you into a private secured link to further assist.,2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messages and no one is responding as usual,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so that we can further assist you. Just click ‘Message’ at the top of your profile.,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0


## Correct the format of the columns

In [163]:
df['tweet_id'] = df['tweet_id'].astype(str)
df['in_response_to_tweet_id'] = df['in_response_to_tweet_id'].astype(str)
df['response_tweet_id'] = df['response_tweet_id'].astype(str)
df['author_id'] = df['author_id'].astype(str)
df['created_at'] = pd.to_datetime(df['created_at'], format='%a %b %d %H:%M:%S %z %Y')

# replace non standard missing values with np.nan
df.replace(['', 'nan', None], np.nan, inplace=True)

## Tweet conversations and subthreads detection

In [164]:
# Step 1: Pre-process the DataFrame to create helpful mappings
response_mapping = df.set_index('tweet_id')['response_tweet_id'].dropna().to_dict()

# set index 'tweet_id'
df = df.set_index('tweet_id', drop=False)

# Initializing new columns
df['conversation_id'] = pd.Series(np.nan, dtype='object')
df['subthread_id'] = pd.Series(np.nan, dtype='object')
df['parent_id'] = pd.Series(np.nan, dtype='object')
df['depth'] = pd.Series(np.nan, dtype='float')  

# Initialize a dictionary to hold conversation information temporarily
conversation_info = {}

def assign_conversation_info(tweet_id, conversation_id, subthread_path, parent_id=np.nan, current_depth=0):
    if tweet_id in conversation_info:  # Skip if already processed
        return
    
    # Generate a subthread ID based on the current subthread path
    subthread_id = ','.join(map(str, subthread_path)) if subthread_path else np.nan
    
    conversation_info[tweet_id] = {
        'conversation_id': str(conversation_id),
        'subthread_id': subthread_id,
        'parent_id': parent_id,
        'depth': current_depth
    }
    
    # Process responses
    responses = response_mapping.get(tweet_id, "")
    if responses:
        for idx, response_id in enumerate(responses.split(',')):
            new_path = subthread_path + [idx + 1] if subthread_path else []
            assign_conversation_info(response_id.strip(), conversation_id, new_path, tweet_id, current_depth + 1)

# Assign unique conversation IDs
# Starting tweets
start_tweets = df[(df['in_response_to_tweet_id'].isnull()) & (df['inbound'] == True)]
print(f"The number of conversations in the dataset is: {start_tweets.shape[0]}")

conversation_counter = 1  # Initialize conversation counter
for i, row in start_tweets.iterrows():
    if conversation_counter > 100:  # Break the loop after processing 100 conversations
        break
    assign_conversation_info(row['tweet_id'], conversation_counter, [conversation_counter])
    conversation_counter += 1


# Step 2: Batch update the DataFrame using the conversation_info dictionary
for tweet_id, info in conversation_info.items():
    idx = df.index[df['tweet_id'] == tweet_id]
    df.loc[idx, 'conversation_id'] = info['conversation_id']
    df.loc[idx, 'subthread_id'] = info['subthread_id']
    df.loc[idx, 'parent_id'] = info['parent_id']
    df.loc[idx, 'depth'] = info['depth']

# Sort by conversation_id and time for better readability
df.sort_values(by=['conversation_id', 'created_at'], inplace=True)

# drop rows that don't have 'conversation_id'
df = df.dropna(subset=['conversation_id'])


The number of conversations in the dataset is: 787346


In [168]:
df.head()

Unnamed: 0_level_0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,conversation_id,subthread_id,parent_id,depth
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
8,8,115712,True,2017-10-31 21:45:10+00:00,@sprintcare is the worst customer service,9610.0,,1,1,,0.0
10,10,sprintcare,False,2017-10-31 21:45:59+00:00,@115712 Hello! We never like our customers to feel like they are not valued.,,8.0,1,13,8.0,1.0
9,9,sprintcare,False,2017-10-31 21:46:14+00:00,@115712 I would love the chance to review the account and provide assistance.,,8.0,1,11,8.0,1.0
6,6,sprintcare,False,2017-10-31 21:46:24+00:00,"@115712 Can you please send us a private message, so that I can gain further details about your account?",57.0,8.0,1,12,8.0,1.0
7,7,115712,True,2017-10-31 21:47:48+00:00,@sprintcare the only way I can get a response is to tweet apparently,,6.0,1,122,6.0,2.0


## Column preparation for causal inference analysis

In [170]:
print(df.columns)

Index(['tweet_id', 'author_id', 'inbound', 'created_at', 'text',
       'response_tweet_id', 'in_response_to_tweet_id', 'conversation_id',
       'subthread_id', 'parent_id', 'depth'],
      dtype='object')


In [173]:
df.head(2)

Unnamed: 0_level_0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,conversation_id,subthread_id,parent_id,depth
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
8,8,115712,True,2017-10-31 21:45:10+00:00,@sprintcare is the worst customer service,9610.0,,1,1,,0.0
10,10,sprintcare,False,2017-10-31 21:45:59+00:00,@115712 Hello! We never like our customers to feel like they are not valued.,,8.0,1,13,8.0,1.0
