Loading the packages

In [1]:
import numpy as np  # NumPy: For numerical and array operations.
import pandas as pd  # Pandas: For data manipulation and analysis.
import matplotlib.pyplot as plt  # Matplotlib: For creating various types of plots and charts.
import seaborn as sns  # Seaborn: For making data visualizations more attractive and informative.

Loading data

In [2]:
# Raw inputs: the site-visit export and the two form-submission exports.
site_visit = pd.read_csv("data/october-dm-site-visit-3-23.csv")
form_submit1 = pd.read_csv("data/form-submissions-export---V1.csv")
# Rows labelled 4 and 5 are excluded from the first export
# (NOTE(review): the reason is not recorded in this notebook).
form_submit1 = form_submit1.drop(index=[4, 5])
form_submit2 = pd.read_csv("data/form-submissions-export---V2.csv")

Identify which drop they are from

- octoberfreetrial - scanned for the first time in drop 1
- octoberfreetrial2 - scanned for the first time in drop 1 (prospects who will receive drop 2)
- octoberfreetrial3 - scanned drop 2

In [3]:
# Map a page-location URL to the direct-mail drop it belongs to
def categorize_drop(row):
    """Return the drop label encoded in a page-location string.

    Parameters
    ----------
    row : str or missing value
        One value of the 'Page location' column (a URL string). Anything that
        is not a string (NaN, None, ...) is treated as "no campaign tag".

    Returns
    -------
    str or None
        "drop2" if the text contains "octoberfreetrial3",
        "drop1-prospect" if it contains "octoberfreetrial2",
        "drop1" if it contains "octoberfreetrial",
        otherwise None.
    """
    # Missing page locations arrive as NaN (a float); a substring test on a
    # float raises TypeError, so bail out early for anything that is not text.
    if not isinstance(row, str):
        return None

    # Order matters: "octoberfreetrial" is a prefix of the other two tags,
    # so the more specific tags have to be tested first.
    if "octoberfreetrial3" in row:
        return "drop2"
    if "octoberfreetrial2" in row:
        return "drop1-prospect"
    if "octoberfreetrial" in row:
        return "drop1"
    return None

# Label every visit with its drop, derived element-wise from the page URL
site_visit['drop'] = site_visit['Page location'].map(categorize_drop)


In [4]:
# Number of distinct page-location URLs (missing values are not counted)
print(site_visit['Page location'].nunique(dropna=True))

82


Getting the ID for each site visit (prospects who scanned the QR code) and for each form submission

In [5]:
# Pull the digits of the utm_id query parameter out of each URL.
# expand=False makes str.extract return a Series, so each 'ID' column holds
# plain strings (leading zeros preserved) and NaN where no utm_id is present.
site_visit['ID'] = site_visit['Page location'].str.extract(r'utm_id=(\d+)', expand=False)
form_submit1['ID'] = form_submit1['Referrer'].str.extract(r'utm_id=(\d+)', expand=False)
form_submit2['ID'] = form_submit2['Referrer'].str.extract(r'utm_id=(\d+)', expand=False)

# Back-fill missing 'utm_id' values with the ID found in the referrer URL.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a column selection is chained assignment, which raises a FutureWarning in
# pandas 2.x and no longer updates the frame under Copy-on-Write.
form_submit1['utm_id'] = form_submit1['utm_id'].fillna(form_submit1['ID'].astype(float))
form_submit2['utm_id'] = form_submit2['utm_id'].fillna(form_submit2['ID'].astype(float))


Checking how many unique IDs there are

In [6]:
# Number of distinct IDs among the site visits (missing IDs are not counted)
print(site_visit['ID'].nunique(dropna=True))

78


Extract the IDs from form_submit1 and form_submit2 as a single list


In [7]:
# Collect the utm_id of every form submission from both exports
extracted_ids = form_submit1['utm_id'].tolist() + form_submit2['utm_id'].tolist()

# Turn numeric ids back into the 5-character, zero-padded strings used in
# site_visit['ID'] (e.g. 3531.0 -> '03531'); non-numeric values pass through.
# Submissions without any utm_id (NaN/None) are skipped: int(NaN) raises
# ValueError, and a missing id can never identify a visitor anyway.
formatted_ids = [
    str(int(utm_id)).zfill(5) if isinstance(utm_id, (int, float)) else utm_id
    for utm_id in extracted_ids
    if not pd.isna(utm_id)
]


# Flag each site visit whose ID also appears among the form submissions
site_visit['submit_form'] = site_visit['ID'].isin(formatted_ids)


print(formatted_ids)


['16167', '39699', '03531', '18238', '26156', '44897', '24333', '31533']


Checking whether they have scanned the QR code more than once

In [8]:
# One row per (ID, Date, drop, submit_form) and one column per event name,
# holding the first 'Sessions' value seen for that combination (0 when the
# event never occurred).
pivoted_df = pd.pivot_table(
    site_visit,
    columns='Event name',
    values='Sessions',
    index=['ID', 'Date', 'drop', 'submit_form'],
    aggfunc='first',
    fill_value=0,
).reset_index()

pivoted_df = pivoted_df.drop('Any_Form-Submit', axis=1)

# Remove the '00000' ID (presumably a placeholder/test code -- TODO confirm).
# .copy() gives an independent frame, so the column assignments below do not
# write into a filtered slice (which triggers SettingWithCopyWarning).
pivoted_df = pivoted_df[pivoted_df['ID'] != '00000'].copy()

# Count the number of rows (distinct date/drop combinations) per ID
id_counts = pivoted_df['ID'].value_counts()

# An ID that occurs on more than one row came back at least once. The
# comparison already yields plain booleans, so no fillna is needed.
pivoted_df['visit_again'] = pivoted_df['ID'].map(id_counts) > 1

pivoted_df['Date'] = pd.to_datetime(pivoted_df['Date'], format='%Y%m%d')

# Sort by date; a stable sort keeps the existing row order among equal dates,
# so the record kept by drop_duplicates below is deterministic.
pivoted_df = pivoted_df.sort_values(by='Date', kind='stable')

# Keep only the earliest record of each ID
pivoted_df = pivoted_df.drop_duplicates(subset='ID', keep='first')


print(id_counts)

ID
03387    2
26156    2
42331    2
39699    2
35942    2
        ..
30737    1
03531    1
32624    1
05869    1
45145    1
Name: count, Length: 77, dtype: int64


In [9]:
# Attach the phone-number column ('手机号码') of each form export to pivoted_df
# with a left join on 'ID'. No suffixes are passed: the first merge adds a
# plain '手机号码' column, and the name clash in the second merge makes pandas
# apply its default suffixes, giving '手机号码_x' (from form_submit1) and
# '手机号码_y' (from form_submit2).
pivoted_df = pd.merge(pivoted_df, form_submit1[['ID', '手机号码']], how='left', on='ID')
pivoted_df = pd.merge(pivoted_df, form_submit2[['ID', '手机号码']], how='left', on='ID')


In [10]:
from unittest import skip
import phonenumbers


# Read only the key and phone columns of the US and Canadian SAM lists,
# then stack both lists into one lookup table with a fresh 0..n-1 index.
sam_us = pd.read_csv(
    "data/October-Target-SAM-List-US-All.csv",
    usecols=['Snowball Map', 'Contact Phone', 'Business Phone', 'All Phone (Print Shop)'],
)
sam_canada = pd.read_csv(
    "data/October-Target-SAM-List-CAN-All.csv",
    usecols=['Snowball Map', 'Contact Phone', 'Business Phone', 'All Phone (Print Shop)'],
)

sam_list = pd.concat((sam_us, sam_canada), ignore_index=True)


In [11]:

# Single 'Phone' column: take the number from '手机号码_x' and, where that is
# missing, the one from '手机号码_y'
pivoted_df['Phone'] = pivoted_df['手机号码_x'].fillna(pivoted_df['手机号码_y'])

# The two suffixed source columns are no longer needed
pivoted_df = pivoted_df.drop(columns=['手机号码_x', '手机号码_y'])

# Strip hyphens from the phone numbers
pivoted_df['Phone'] = pivoted_df['Phone'].str.replace('-', '', regex=True)


If the phone cell is empty, fill it with the phone number from the submitted form

In [12]:

import pandas as pd
import numpy as np

# Requires 'pivoted_df' and 'sam_list' from the cells above.
# Cast the string IDs to int64 so they can be joined against 'Snowball Map'
# (presumably an integer key in the SAM lists -- TODO confirm). Note that the
# cast discards the leading zeros of the ID strings.
pivoted_df['ID'] = pivoted_df['ID'].astype('int64')


# Left join: keep every pivoted_df row and add the SAM-list phone columns
# where 'ID' equals 'Snowball Map'
merged_df = pivoted_df.merge(sam_list, how='left', left_on='ID', right_on='Snowball Map')

# Pick the best available phone number for one merged row
def fill_phone(row):
    """Return the highest-priority phone number available in ``row``.

    Priority: 'Contact Phone', then 'Business Phone', then
    'All Phone (Print Shop)'. When none of them is present, the value already
    stored under 'Phone' (the number taken from the form submissions) is kept
    instead of being discarded.

    Parameters
    ----------
    row : pandas.Series or dict
        One record holding the three phone columns above and, optionally,
        an existing 'Phone' entry.

    Returns
    -------
    object
        The first non-missing phone value, or ``np.nan`` when every candidate
        (including 'Phone') is missing.
    """
    for column in ('Contact Phone', 'Business Phone', 'All Phone (Print Shop)'):
        candidate = row[column]
        if not pd.isna(candidate):
            return candidate

    # Last resort: the number already in 'Phone'. Returning NaN here
    # unconditionally would throw away the form-submitted number.
    existing = row.get('Phone', np.nan)
    return np.nan if pd.isna(existing) else existing

# Resolve one phone number per row with fill_phone
merged_df['Phone'] = merged_df.apply(fill_phone, axis=1)

# The SAM-list helper columns brought in by the merge are no longer needed
merged_df = merged_df.drop(
    columns=['Snowball Map', 'Contact Phone', 'Business Phone', 'All Phone (Print Shop)']
)

# Continue under the 'pivoted_df' name with an independent copy
pivoted_df = merged_df.copy()

# Keep digits only, removing every non-digit character from the phone numbers
pivoted_df['Phone'] = pivoted_df['Phone'].str.replace(r'\D', '', regex=True)



Using the Inbound Call Tracker to check whether prospects who scanned the QR code have also called

In [13]:
inbound_call = pd.read_csv("data/Inbound-Call-Notes.csv")

# Prefix the year 2023 to each 'Date' value and parse the result with the
# "<year> <weekday abbreviation> <month>/<day>" format
inbound_call['Date'] = pd.to_datetime('2023 ' + inbound_call['Date'], format='%Y %a %m/%d')

# Keep October calls only; work on a copy so the edit below does not touch
# inbound_call
october_calls = inbound_call[inbound_call['Date'].dt.month.eq(10)].copy()
# Store the date as 'YYYY-MM-DD' text
october_calls['Date'] = october_calls['Date'].dt.strftime('%Y-%m-%d')

# Calls whose 'Extension Channel' is 'DM'
october_dm_calls = october_calls.loc[october_calls['Extension Channel'].eq('DM')]

# Phone numbers of the October DM callers, reduced to digits so they are
# comparable with pivoted_df['Phone'] (which has every non-digit removed).
# Missing numbers are dropped first: isin() treats NaN as equal to NaN, so a
# single call without a number would flag every prospect without a phone.
extracted_phone = [
    digits
    for digits in (
        october_dm_calls['Phone']
        .dropna()
        .astype(str)
        # a numeric column read as float renders as e.g. "7854919016.0"
        .str.replace(r'\.0$', '', regex=True)
        .str.replace(r'\D', '', regex=True)
    )
    if digits  # ignore entries that contained no digits at all
]
pivoted_df['inbound_call'] = pivoted_df['Phone'].isin(extracted_phone)

# The column at position 16 of the call log is renamed to 'Campaign'
# (NOTE(review): positional lookup -- verify the index if the export layout changes)
october_dm_calls = october_dm_calls.rename(
    columns={october_dm_calls.columns[16]: 'Campaign'}
)

# Calls attributed to the October free-trial campaign
october_ft_calls = october_dm_calls.loc[
    october_dm_calls['Campaign'].eq('Mkt_DM_Snowflake_OctoberFreeTrial')
]

df = october_ft_calls.copy()

df['Date'] = pd.to_datetime(df['Date'])

# Define the date ranges for 'drop1' and 'drop2'
start_date_drop1 = pd.to_datetime('2023-10-03')
end_date_drop1 = pd.to_datetime('2023-10-05')
start_date_drop2 = pd.to_datetime('2023-10-18')
end_date_drop2 = pd.to_datetime('2023-10-20')

# Label each call with its period: 'drop1' inside the drop-1 window (both
# bounds inclusive), 'drop2' otherwise.
# NOTE(review): every date outside the drop-1 window is labelled 'drop2',
# including dates before drop 1 or between the two windows; the drop-2 bounds
# above are defined but never consulted -- confirm this is intended.
df['Period'] = np.where(
    df['Date'].between(start_date_drop1, end_date_drop1), 'drop1', 'drop2'
)

# Every column except the last one ('Period' was just appended at the end)
cols_except_last = df.columns[:-1].tolist()
# Put 'Date' and 'Period' first, followed by the remaining columns in their
# original order. 'Date' is left out of the remainder: listing it twice would
# select it twice and write a duplicate 'Date' column to the CSV.
new_order = ['Date', 'Period'] + [col for col in cols_except_last if col != 'Date']
df = df[new_order]

# Export the labelled free-trial calls
df.to_csv("data/october_dm_called.csv")

adding missing rows

In [14]:
import pandas as pd

# Requires the 'pivoted_df' DataFrame built in the cells above.

# Hand-entered records for two form submitters that are added to the table.
# NOTE(review): 'AnyFormSubmit' does not match the 'Any_Form-Submit' column
# that was dropped from the pivot earlier; unless pivoted_df already has an
# 'AnyFormSubmit' column, the concat adds it as a new column that is NaN for
# all existing rows -- confirm this is intended.
new_rows_data = [
    {'ID': '31533','Date': '10/12/23', 'drop': 'drop1', 'submit_form': True, 'AnyFormSubmit': 1, 'click': 0, 'cta_click': 0, 'first_visit': 0, 'form_start': 0, 'page_view': 0, 'scroll': 0, 'session_start': 0, 'user_engagement': 0, 'visit_again': False, 'Phone': '7854919016', 'inbound_call': True},
    {'ID': '18238','Date': '10/11/23', 'drop': 'drop1', 'submit_form': True, 'AnyFormSubmit': 1, 'click': 0, 'cta_click': 0, 'first_visit': 0, 'form_start': 0, 'page_view': 0, 'scroll': 0, 'session_start': 0, 'user_engagement': 0, 'visit_again': False, 'Phone': '201-523-6952', 'inbound_call': False}
]

# Convert the list of dictionaries into a DataFrame
new_rows_df = pd.DataFrame(new_rows_data)

# Bring the hand-entered values in line with the existing columns, so the
# combined table does not mix types or formats within one column:
#   - 'Date' becomes a real timestamp (pivoted_df['Date'] is datetime64)
#   - 'Phone' is reduced to digits, like the other phone numbers
#   - 'ID' takes the dtype of pivoted_df['ID'] (int64 after the SAM-list merge)
new_rows_df['Date'] = pd.to_datetime(new_rows_df['Date'], format='%m/%d/%y')
new_rows_df['Phone'] = new_rows_df['Phone'].str.replace(r'\D', '', regex=True)
new_rows_df['ID'] = new_rows_df['ID'].astype(pivoted_df['ID'].dtype)

# Skip IDs that are already in the table, so re-running this cell does not
# append the same rows a second time.
new_rows_df = new_rows_df[~new_rows_df['ID'].isin(pivoted_df['ID'])]

# Append the new rows; ignore_index=True renumbers the combined index
if not new_rows_df.empty:
    pivoted_df = pd.concat([pivoted_df, new_rows_df], ignore_index=True)


# 'ignore_index=True' resets the index of the combined DataFrame.


In [15]:
# Write the cleaned table to disk (the row index is written as the first column)
pivoted_df.to_csv(path_or_buf="data/clean_data.csv", index=True)