# TAL Research
## **EDA: Part 3 of 4** - EDA of Train Data



**Objective**: Develop a small scale model based on the data provided to appropriately help the team prioritize accounts for our sales reps to target.

**What we know**:
- **unique ID** is the id_number, i.e. every id number belongs to a unique company. This is important because it determines how we will consider (concatenate) the different datasets (excel files) with each other.

**Datasets**:
- **Train**: 254 accounts that have been reached out to, with 122 successful sales
- **Test**: 132 target accounts, which have had **no** interactions yet




# EDA for **training**



#### Imports

In [11]:
import pandas as pd
import seaborn as sns
import sys
import os
from matplotlib import pyplot as plt
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import math

In [12]:
import pandas as pd

# Locations of the processed train/test CSVs (the "intersection" directory).
# An alternative pair of files sits under '../../../data_processed/intent/'.
train_data_path = '../../../data_processed/intersection/train.csv'
test_data_path = '../../../data_processed/intersection/test.csv'

# Read both splits into DataFrames.
train, test = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

# Report the (rows, columns) shape of each split as a quick sanity check.
for heading, frame in (("Train Data Imputed:", train), ("Test Data Imputed:", test)):
    print(heading)
    print(frame.shape)


Train Data Imputed:
(254, 106)
Test Data Imputed:
(132, 105)


In [13]:
# Total number of missing cells in each split, displayed as (train, test).
tuple(split.isna().sum().sum() for split in (train, test))

(0, 0)

In [14]:
# Separate the label column ('target_met') from the remaining feature columns.
x_train = train.drop(columns=['target_met'])
y_train = train['target_met']

In [15]:
from plotly.subplots import make_subplots


# Indicator columns for organization size that are summed explicitly below.
size_ = [
    'org_size_1_99',
    'org_size_500_999',
    'org_size_1000_4999',
    'org_size_greaterthan_5000',
]

# Function to calculate percentages for size categories
def calculate_percentages(data, size_categories, remaining_label='org_size_100_499'):
    """Return the percentage of rows that fall into each size category.

    Args:
        data: DataFrame holding one numeric indicator column per entry of
            ``size_categories``.
        size_categories: iterable of column names whose sums are converted
            to percentages of the number of unique index values in ``data``.
        remaining_label: label for the implicit final category that receives
            whatever is left to reach 100%. Defaults to 'org_size_100_499',
            the label that was previously hard-coded.

    Returns:
        A ``(labels, percentages)`` pair of equal-length lists: the given
        categories in order, followed by ``remaining_label`` and its share.
    """
    total_companies = len(data.index.unique())

    labels = list(size_categories)
    percentages = [
        (data[column].sum() / total_companies) * 100 for column in labels
    ]

    # The category without its own indicator column gets the remainder.
    remaining_percentage = 100 - sum(percentages)
    labels.append(remaining_label)
    percentages.append(remaining_percentage)

    return labels, percentages

# Split the training rows by outcome.
data_target_met_1 = train.loc[train['target_met'] == 1]
data_target_met_0 = train.loc[train['target_met'] == 0]

# Size-category shares within each outcome group.
labels_1, percentages_1 = calculate_percentages(data_target_met_1, size_)
labels_0, percentages_0 = calculate_percentages(data_target_met_0, size_)

# One pie per outcome, laid out left to right.
pie_specs = [
    ('Target Met = 1', labels_1, percentages_1),
    ('Target Met = 0', labels_0, percentages_0),
]
fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{'type': 'domain'} for _ in pie_specs]],
    subplot_titles=[pie_title for pie_title, _, _ in pie_specs],
)
for position, (pie_title, pie_labels, pie_values) in enumerate(pie_specs, start=1):
    fig.add_trace(
        go.Pie(
            labels=pie_labels,
            values=pie_values,
            textinfo='label+percent',
            name=pie_title,
        ),
        row=1,
        col=position,
    )

fig.update_layout(title_text="Distribution of Accounts by Organization Size for Target Met")
fig.show()

In [16]:
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Work on an independent copy so this cell cannot modify `train`.
train_data = train.copy(deep=True)

# Function to generate the plots for a given organization size and target_met value
def generate_plots_for_size_and_target(train_data, size_column, target_met_value):
    """Display a three-panel figure for one size flag and one outcome value.

    Panels, left to right: accounts per industry (horizontal bars), number of
    companies per vehicle-count range (bubbles), and accounts per US state
    (choropleth).

    Args:
        train_data: DataFrame containing 'target_met', 'billingstatecode',
            'Industry', 'Number_of_Vehicles__c' and the column named by
            ``size_column``.
        size_column: name of an indicator column; only rows where it equals 1
            are plotted.
        target_met_value: only rows whose 'target_met' equals this value are
            plotted.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    # Keep only the rows for this size flag and outcome.
    selected_rows = (train_data[size_column] == 1) & (train_data['target_met'] == target_met_value)
    size_data = train_data[selected_rows]

    # Accounts per state (feeds the choropleth).
    state_data = size_data['billingstatecode'].value_counts().reset_index()
    state_data.columns = ['State', 'Count']

    # Accounts per industry (feeds the bar chart).
    industry_data = size_data['Industry'].value_counts().reset_index()
    industry_data.columns = ['Industry', 'Count']

    # Drop negative vehicle counts (NaN also fails the comparison). The
    # .copy() makes the result an independent frame, so adding
    # 'Vehicle_Range' below does not write into a slice of the caller's data.
    size_data = size_data[size_data['Number_of_Vehicles__c'] >= 0].copy()

    # Manually defined bin edges and their display labels.
    bins = [0, 10, 50, 100, 500, 1000, 5000, 10000]
    labels = ['0-10', '11-50', '51-100', '101-500', '501-1000', '1001-5000', '5001-10000']

    # Right-closed bins with the lowest edge included give
    # [0, 10], (10, 50], (50, 100], ... which is what the labels say
    # (e.g. exactly 10 vehicles belongs to '0-10', not '11-50').
    # Values above 10000 fall outside every bin and are not counted.
    size_data['Vehicle_Range'] = pd.cut(
        size_data['Number_of_Vehicles__c'],
        bins=bins,
        labels=labels,
        right=True,
        include_lowest=True,
    )
    vehicle_data = size_data['Vehicle_Range'].value_counts().reset_index()
    vehicle_data.columns = ['Vehicle_Range', 'Count']

    # Largest bin count; it scales the bubbles and picks the annotated one.
    max_count = vehicle_data['Count'].max()

    # Only a non-empty largest bin gets the long caption; with no matching
    # rows every count is 0 and would otherwise all be captioned.
    bubble_text = [
        f'{count} companies have<br>{label} Vehicles' if count == max_count and max_count > 0 else f'{count}'
        for count, label in zip(vehicle_data['Count'], vehicle_data['Vehicle_Range'])
    ]

    # Text placement per bubble.
    text_positions = [
        'bottom center' if count == max_count and label == '11-50' else 'middle right'
        for count, label in zip(vehicle_data['Count'], vehicle_data['Vehicle_Range'])
    ]

    # Avoid a zero sizeref (a divisor for marker sizes) when all counts are 0.
    bubble_sizeref = 2. * max(max_count, 1) / (60. ** 2)

    # Create subplots
    fig = make_subplots(
        rows=1, cols=3,
        subplot_titles=(
            "Accounts by Industry",
            "Amount of companies owning vehicles - per range of vehicles",
            "Number of Accounts by State"
        ),
        column_widths=[0.33, 0.33, 0.33],
        horizontal_spacing=0.05,
        specs=[[{"type": "xy"}, {"type": "xy"}, {"type": "choropleth"}]]
    )

    # Industry bar chart
    fig.add_trace(
        go.Bar(
            x=industry_data['Count'],
            y=industry_data['Industry'],
            orientation='h'
        ),
        row=1, col=1
    )

    # Vehicle-range bubble chart
    fig.add_trace(
        go.Scatter(
            x=vehicle_data['Vehicle_Range'],
            y=vehicle_data['Count'],
            mode='markers+text',
            text=bubble_text,
            textposition=text_positions,
            marker=dict(size=vehicle_data['Count'], sizemode='area', sizeref=bubble_sizeref, sizemin=4),
            textfont=dict(size=14)
        ),
        row=1, col=2
    )

    # State choropleth
    fig.add_trace(
        go.Choropleth(
            locations=state_data['State'],
            z=state_data['Count'],
            locationmode='USA-states',
            colorscale="Viridis",
            colorbar_title="Count"
        ),
        row=1, col=3
    )
    fig.update_geos(projection_scale=5, center=dict(lat=37.0902, lon=-95.7129), row=1, col=3)

    # Figure-level title, size and margins
    fig.update_layout(
        title={
            'text': f"<b>Analysis of Accounts from {size_column} with target_met = {target_met_value}</b>",
            'y': 0.98,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font_size': 20
        },
        showlegend=False,
        height=400,
        width=1600,
        margin=dict(l=20, r=20, t=80, b=20)
    )

    # Subplot titles are annotations; set their font size.
    fig.update_annotations(font_size=16)

    fig.show()

# Organization-size indicator columns to plot.
sizes = [
    'org_size_1_99',
    'org_size_500_999',
    'org_size_1000_4999',
    'org_size_greaterthan_5000',
]

# For every size, show the target_met = 1 figure first, then target_met = 0.
for size in sizes:
    for outcome in (1, 0):
        generate_plots_for_size_and_target(train_data, size, outcome)


In [18]:
import pandas as pd
import plotly.graph_objects as go

# Take an independent copy of the already-loaded training frame.
train_data = train.copy(deep=True)

# Function to calculate segment counts
def calculate_segment_counts(data):
    """Summarize segment membership for the rows of ``data``.

    Returns a dict with the number of unique index values
    ('total_companies'), the column sum of each of the four segment
    indicator columns, and the number of rows where all four indicators
    are 0 ('none_segment_count').
    """
    # Indicator column -> key used in the returned dict.
    result_key_by_column = {
        'Segment_E': 'segment_e_count',
        'SegmentE3': 'segment_e3_count',
        'SegmentA1': 'segment_a1_count',
        'SegmentA2': 'segment_a2_count',
    }

    counts = {'total_companies': len(data.index.unique())}
    for column, result_key in result_key_by_column.items():
        counts[result_key] = data[column].sum()

    # Rows that carry none of the four segment flags.
    in_no_segment = data[list(result_key_by_column)].eq(0).all(axis=1)
    counts['none_segment_count'] = data[in_no_segment].shape[0]

    return counts

# Segment summaries for each outcome group.
data_target_met_1 = train_data.loc[train_data['target_met'] == 1]
data_target_met_0 = train_data.loc[train_data['target_met'] == 0]
segment_counts_1 = calculate_segment_counts(data_target_met_1)
segment_counts_0 = calculate_segment_counts(data_target_met_0)

# Slice labels and the summary keys that feed them, in matching order.
labels = ['Segment E', 'Segment E3', 'Segment A1', 'Segment A2', 'None']
slice_keys = [
    'segment_e_count',
    'segment_e3_count',
    'segment_a1_count',
    'segment_a2_count',
    'none_segment_count',
]
values_1 = [segment_counts_1[key] for key in slice_keys]
values_0 = [segment_counts_0[key] for key in slice_keys]

# Two side-by-side pies, one per outcome.
fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{'type': 'domain'}, {'type': 'domain'}]],
    subplot_titles=['Target Met = 1', 'Target Met = 0'],
)
fig.add_trace(
    go.Pie(labels=labels, values=values_1, textinfo='label+percent', name='Target Met = 1'),
    row=1,
    col=1,
)
fig.add_trace(
    go.Pie(labels=labels, values=values_0, textinfo='label+percent', name='Target Met = 0'),
    row=1,
    col=2,
)

fig.update_layout(title_text="Distribution of Companies by Segments for Target Met")
fig.show()


In [19]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Number of distinct id_number values (a missing value counts as one).
total_companies = train['id_number'].nunique(dropna=False)

# Count rows with at least one reply and rows with at least one click.
def calculate_engagement_counts(df):
    """Return ``(rows with reply_count > 0, rows with click_count > 0)``."""
    replied = df['reply_count'].gt(0)
    clicked = df['click_count'].gt(0)
    return replied.sum(), clicked.sum()

# Outcome groups and their sizes.
data_target_1 = train.loc[train['target_met'] == 1]
data_target_0 = train.loc[train['target_met'] == 0]
total_companies_1 = data_target_1.shape[0]
total_companies_0 = data_target_0.shape[0]

# Reply / click engagement counts per outcome group.
num_companies_replied_1, num_companies_clicked_1 = calculate_engagement_counts(data_target_1)
num_companies_replied_0, num_companies_clicked_0 = calculate_engagement_counts(data_target_0)

# Each pie has two slices: engaged rows and the rest of the group.
labels = ['Engaged', 'Not Engaged']
values_replies_1 = [num_companies_replied_1, total_companies_1 - num_companies_replied_1]
values_clicks_1 = [num_companies_clicked_1, total_companies_1 - num_companies_clicked_1]
values_replies_0 = [num_companies_replied_0, total_companies_0 - num_companies_replied_0]
values_clicks_0 = [num_companies_clicked_0, total_companies_0 - num_companies_clicked_0]

# 2x2 grid: rows are outcomes (1 on top), columns are replies then clicks.
pie_grid = [
    [('Replies (Target Met = 1)', values_replies_1), ('Clicks (Target Met = 1)', values_clicks_1)],
    [('Replies (Target Met = 0)', values_replies_0), ('Clicks (Target Met = 0)', values_clicks_0)],
]
fig = make_subplots(
    rows=2,
    cols=2,
    specs=[[{'type': 'domain'} for _ in grid_row] for grid_row in pie_grid],
    subplot_titles=[pie_title for grid_row in pie_grid for pie_title, _ in grid_row],
)
for row_number, grid_row in enumerate(pie_grid, start=1):
    for col_number, (pie_title, pie_values) in enumerate(grid_row, start=1):
        fig.add_trace(
            go.Pie(labels=labels, values=pie_values, name=pie_title),
            row=row_number,
            col=col_number,
        )

fig.update_layout(title_text="Distribution of Companies Engaged in Replies and Clicks by Target Met")
fig.show()



In [20]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import math

# Fresh copy of the training frame with 'last_action_date' parsed to datetime.
train_data = train.copy()
train_data['last_action_date'] = pd.to_datetime(train_data['last_action_date'])

# New column -> datetime component it is taken from.
date_part_columns = {
    'month_last': 'month',
    'year_last': 'year',
    'day_last': 'day',
    'hour_last': 'hour',
}
last_action = train_data['last_action_date'].dt
for new_column, component in date_part_columns.items():
    train_data[new_column] = getattr(last_action, component)

# Define function to generate combined bar plots
def generate_combined_bar_plot(data, column, top_n=None):
    """Build one bar trace per outcome showing value counts of ``column``.

    Args:
        data: DataFrame containing 'target_met' and ``column``.
        column: name of the column whose values are counted.
        top_n: when truthy, keep only the ``top_n`` most frequent values
            within each outcome group; otherwise keep all values.

    Returns:
        ``(bar_1, bar_0)``: ``go.Bar`` traces for target_met == 1 (blue) and
        target_met == 0 (orange).
    """
    def _count_table(target_value):
        # Frequency table of `column` for one outcome group.
        frequencies = data[data['target_met'] == target_value][column].value_counts()
        if top_n:
            frequencies = frequencies.nlargest(top_n)
        table = frequencies.reset_index()
        table.columns = [column, 'count']
        return table

    met_counts = _count_table(1)
    not_met_counts = _count_table(0)

    bar_1 = go.Bar(
        name='Target Met = 1',
        x=met_counts[column],
        y=met_counts['count'],
        marker_color='blue',
    )
    bar_0 = go.Bar(
        name='Target Met = 0',
        x=not_met_counts[column],
        y=not_met_counts['count'],
        marker_color='orange',
    )
    return bar_1, bar_0

# Columns to plot and the subplot title for each (same order).
variables = [
    'Industry', 'billingstatecode', 'click_count', 'reply_count', 'D&B_Score', 
    'Has_Website', 'month_last', 'year_last', 'day_last', 'hour_last'
]
titles = [
    'Industry', 'Billing State Code', 'Click Count', 'Reply Count', 'D&B Score', 
    'Has Website', 'Month of Last Action', 'Year of Last Action', 'Day of Last Action', 'Hour of Last Action'
]

# Three plots per row.
num_rows = math.ceil(len(variables) / 3)

# Create subplots
fig = make_subplots(rows=num_rows, cols=3, subplot_titles=titles)

# Add the two outcome bars for each variable to its grid cell.
for i, variable in enumerate(variables):
    row = (i // 3) + 1
    col = (i % 3) + 1
    # Billing state is limited to the 8 most frequent values per outcome.
    top_n = 8 if variable == 'billingstatecode' else None
    bar_1, bar_0 = generate_combined_bar_plot(train_data, variable, top_n)
    # Every subplot repeats the same two series. Group them by name and list
    # only the first subplot's pair, so the legend shows two entries instead
    # of two per subplot; each entry then toggles its series everywhere.
    for bar in (bar_1, bar_0):
        bar.update(legendgroup=bar.name, showlegend=(i == 0))
        fig.add_trace(bar, row=row, col=col)

# Update layout
fig.update_layout(barmode='group', height=400 * num_rows, width=1200, title_text="Distribution of Variables by Target Met")

# Show the plot
fig.show()



In [21]:
# Duration columns to plot and the subplot title for each (same order).
length_variables = [
    'engagement_length_days', 'email_interaction_length_days', 'intent_length_days'
]
length_titles = [
    'Engagement Length (Days)', 'Email Interaction Length (Days)', 'Intent Length (Days)'
]

# One subplot per duration column, side by side.
length_fig = make_subplots(rows=1, cols=3, subplot_titles=length_titles)

# Overlay the two outcome histograms in each subplot.
for i, variable in enumerate(length_variables):
    col = i + 1
    # The same two series appear in every subplot. Group them by name and
    # list only the first subplot's pair, so the legend shows two entries
    # rather than one pair per subplot.
    hist_1 = go.Histogram(
        name='Target Met = 1',
        x=train_data[train_data['target_met'] == 1][variable],
        marker_color='blue',
        opacity=0.75,
        legendgroup='Target Met = 1',
        showlegend=(i == 0),
    )
    hist_0 = go.Histogram(
        name='Target Met = 0',
        x=train_data[train_data['target_met'] == 0][variable],
        marker_color='orange',
        opacity=0.75,
        legendgroup='Target Met = 0',
        showlegend=(i == 0),
    )
    length_fig.add_trace(hist_1, row=1, col=col)
    length_fig.add_trace(hist_0, row=1, col=col)

# Update layout
length_fig.update_layout(barmode='overlay', height=400, width=1200, title_text="Distribution of Length Variables by Target Met")

# Show the plot
length_fig.show()

**End of Notebook**