In [1]:
import pandas as pd
import numpy as np
import re
from datetime import date

# Input CSV read at the start of the notebook.
# NOTE(review): absolute, user-specific path; the notebook will not run on
# another machine without editing it. Consider a configurable data directory.
data_path = '/Users/tony/Desktop/outbound/data/input_data.csv'
# Destination CSV written by the final cell (a local file path, despite the "url" name).
output_url = '/Users/tony/Desktop/outbound/data/master_data.csv'


# Load the raw export; later cells rebind or mutate this `data` frame.
data = pd.read_csv(data_path)

#function to remove unwanted columns


def remove_unwanted_columns(data):
    """Return a copy of ``data`` restricted to the columns the pipeline uses.

    Columns in the keep-list that are absent from ``data`` are skipped
    silently. The result follows the order of the keep-list, not the order of
    the input frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw input frame.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding only the keep-list columns found in
        ``data``.
    """
    columns_to_keep = [
        'Domain', 'Name', 'Email', 'Cell/Mobile Number', 'Job Title', 'Personal Linkedin',
        'Duration', 'Number of Visits', 'Visitor Location', 'Department', 'Seniority',
        'Industry', 'Founded Year', 'Estimated Annual Revenue', 'Type of contact',
        'Employees Range', 'Sector', 'company.name', 'Company.domain', 'Company.sector',
        'Company.industry', 'Company.country', 'company.linkedinPage', 'company.lastVisit',
        'leadId', 'leadLink', 'Summary.visits', 'Summary.duration',
        'Summary.utm_source', 'Summary.utm_medium'
    ]

    present = [col for col in columns_to_keep if col in data.columns]
    # .copy() detaches the result from the input frame, so later cells can add
    # columns to it without pandas emitting SettingWithCopyWarning.
    return data[present].copy()

# Drop every column the pipeline does not use; `data` is rebound to the filtered frame.
data = remove_unwanted_columns(data)

In [2]:
#rename columns to better fit the needs
def rename_columns(data):
    """Return ``data`` with source column names mapped to the pipeline's names.

    Source names that do not exist in ``data`` are ignored by
    ``DataFrame.rename``; the input frame itself is not modified.
    """
    # (source name, pipeline name) pairs
    rename_pairs = (
        ("Domain Name", "website"),
        ("Email", "contact_email"),
        ("Cell/Mobile Number", "mobile_number"),
        ("Job Title", "position"),
        ("Personal Linkedin", "linkedin_profile"),
        ("Duration", "visit_duration"),
        ("Number of Visits", "visit_count"),
        ("Visitor Location", "location"),
        ("Department", "department"),
        ("Estimated Annual Revenue", "revenue_range"),
        ("Type of contact", "contact_status"),
        ("Employees Range", "employee_count_range"),
        ("Sector", "industry"),
        ("company.name", "company_name"),
        ("company.linkedinPage", "company_linkedin"),
        ("company.lastVisit", "last_visit_date"),
        ("leadId", "lead_id"),
        ("leadLink", "lead_url"),
        ("summary.leadScore", "lead_score"),
    )
    mapping = dict(rename_pairs)
    renamed = data.rename(columns=mapping)
    return renamed

# Switch to the pipeline's column names; later cells refer to the renamed columns.
data = rename_columns(data)


In [3]:
#convert visit duration "visit_duration" to seconds

def convert_duration_to_seconds(data, column_name, new_column_name):
    """Add ``new_column_name`` holding the durations of ``column_name`` in seconds.

    Accepted formats are ``H:M:S`` and ``M:S`` strings with integer fields.
    Anything else (missing values, non-strings, non-integer fields, too many
    fields) becomes 0 rather than raising.

    The new column is written into ``data`` in place and ``data`` is returned.
    """
    def duration_to_seconds(duration):
        if not isinstance(duration, str) or ':' not in duration:
            return 0  # Default for invalid or missing data
        try:
            parts = [int(part) for part in duration.split(':')]
        except ValueError:
            return 0  # a field was not an integer, e.g. "1:xx" or "0:01:02.5"
        if len(parts) == 2:
            parts.insert(0, 0)  # "M:S" is treated as zero hours
        if len(parts) != 3:
            return 0
        h, m, s = parts
        return h * 3600 + m * 60 + s

    data[new_column_name] = data[column_name].apply(duration_to_seconds)
    return data

# Derive "time_spent_on_site" (seconds) from the "visit_duration" strings.
data = convert_duration_to_seconds(data, "visit_duration", "time_spent_on_site")
# The raw duration string is not needed once the seconds column exists.
data = data.drop(columns=["visit_duration"])

In [4]:
def process_and_create_revenue_column(data, input_column="revenue_range", output_column="revenue"):
    """Add ``output_column`` holding the upper bound of each revenue range.

    Each value of ``input_column`` is expected to look like ``"$1M - $10M"``.
    The upper bound is stored as an upper-cased string with its unit suffix
    (``K``, ``M`` or ``B``), e.g. ``"10M"``. Non-string values and strings
    that do not match the range pattern become ``"0"``.

    The column is written into ``data`` in place and ``data`` is returned.
    """

    def standardize_annual_revenue(revenue_range):
        """Return the higher value of a revenue range string, with its suffix."""

        if not isinstance(revenue_range, str):
            return "0"  # Handle non-string input

        # The suffix class includes B so that a range such as "$500M - $1B"
        # keeps its unit; without it the upper bound came back as a bare "1".
        match = re.match(r"\$([\d.]*[KMB]?)[\s-]*\$([\d.]*[KMB]?)", revenue_range, re.IGNORECASE)
        if match:
            return match.group(2).upper()

        return "0"  # Default for invalid format

    data[output_column] = data[input_column].apply(standardize_annual_revenue)
    return data

data = process_and_create_revenue_column(data) # Adds 'revenue' (upper bound of 'revenue_range')
# Drop the original range column now that 'revenue' has been derived from it.
data = data.drop(columns=["revenue_range"])


In [5]:
def convert_revenue_to_millions(revenue_str):
    """Converts a revenue string (e.g., '5M', '100K', '1B') to millions.

    Rules:
      * ``int``/``float`` input is returned unchanged.
      * Other non-string input returns 0.
      * ``K`` divides by 1000, ``M`` is taken as-is, ``B`` multiplies by 1000
        (case-insensitive).
      * With no suffix the whole string is read as a plain amount and divided
        by 1,000,000.
      * Anything unparseable returns 0.
    """

    if isinstance(revenue_str, (int, float)):  # already a number
        return revenue_str

    if not isinstance(revenue_str, str):  # Handle non-string input
        return 0

    revenue_str = revenue_str.upper()  # Handle lowercase (k, m, b)

    # B must be part of the suffix class, otherwise "1B" is parsed as a bare
    # "1" and the billions branch can never run.
    match = re.match(r"([\d.]+)([KMB]?)", revenue_str)
    if not match:
        return 0  # Default for invalid format

    multiplier = match.group(2)
    if not multiplier:
        # No suffix: the entire string must be a plain number (e.g. "1000000").
        try:
            return float(revenue_str) / 1000000
        except ValueError:
            return 0

    try:
        amount = float(match.group(1))
    except ValueError:
        return 0  # digits/dots that are not a number, e.g. "1.2.3M"

    if multiplier == 'K':
        return amount / 1000
    if multiplier == 'M':
        return amount
    return amount * 1000  # 'B'



def create_revenue_in_millions_column(data, input_column="revenue", output_column="revenue_in_million"):
    """Write ``output_column`` into ``data`` with each ``input_column`` value in millions.

    Conversion is delegated to ``convert_revenue_to_millions``; ``data`` is
    modified in place and returned.
    """
    source_values = data[input_column]
    in_millions = source_values.apply(convert_revenue_to_millions)
    data[output_column] = in_millions
    return data


# Add the numeric 'revenue_in_million' column, then drop the intermediate 'revenue' strings.
data = create_revenue_in_millions_column(data)
data = data.drop(columns=['revenue'])


In [6]:
def convert_employee_range(employee_range):
    """Converts an employee count range string to a single number (higher value or integer).

    Examples: ``"51 - 200"`` -> 200, ``"1-10"`` -> 10,
    ``"1,001 - 5,000"`` -> 5000, ``"10,001+"`` -> 10001, ``"250"`` -> 250.

    Missing values, unsupported types and unparseable strings return 0.
    Numeric input is truncated to ``int``.
    """

    if pd.isna(employee_range):  # Check for NaN using pandas.isna()
        return 0  # Return 0 for NaN values

    if isinstance(employee_range, (int, float)):  # Check if it's already a number
        try:
            return int(employee_range)
        except (ValueError, OverflowError):
            return 0  # e.g. infinity

    if not isinstance(employee_range, str):
        return 0  # Handle other non-string/non-numeric input

    # Upper bound of "<low> - <high>": the dash may be a hyphen, en dash or em
    # dash, with or without surrounding spaces, and <high> may contain any
    # number of thousands separators. Requiring a digit before the dash keeps
    # a plain negative number from being read as a range.
    match = re.search(r"\d\s*[-\u2013\u2014]\s*(\d[\d,]*)", employee_range)
    if match:
        digits = match.group(1)
    else:
        # Single value, optionally open-ended such as "10,001+".
        digits = employee_range.strip().rstrip("+").strip()

    try:
        return int(digits.replace(",", ""))
    except ValueError:
        return 0

def create_employee_count_column(data, input_column="employee_count_range", output_column="employee_count"):
    """Write ``output_column`` into ``data`` with one count per ``input_column`` value.

    Conversion is delegated to ``convert_employee_range``; ``data`` is
    modified in place and returned.
    """
    ranges = data[input_column]
    counts = ranges.apply(convert_employee_range)
    data[output_column] = counts
    return data

# Add the numeric 'employee_count' column, then drop the original range strings.
data = create_employee_count_column(data)
data = data.drop(columns=["employee_count_range"])



In [7]:


def extract_date(timestamp):
    """Return the leading ``YYYY-MM-DD`` part of a timestamp string.

    Returns ``None`` for non-string input and for strings that do not start
    with a ``YYYY-MM-DD`` pattern.
    """
    if isinstance(timestamp, str):
        leading_date = re.match(r"(\d{4}-\d{2}-\d{2})", timestamp)
        if leading_date is not None:
            return leading_date.group(1)
    # Non-strings and strings without a leading date both end up here.
    return None

def create_last_visited_date_column(data, input_column="last_visit_date", output_column="last_visited_date"):
    """Write ``output_column`` into ``data`` with the date part of each ``input_column`` value.

    Extraction is delegated to ``extract_date``; ``data`` is modified in place
    and returned.
    """
    timestamps = data[input_column]
    dates_only = timestamps.apply(extract_date)
    data[output_column] = dates_only
    return data


# Add 'last_visited_date' (date-only strings), then drop the raw 'last_visit_date' values.
data = create_last_visited_date_column(data)
data = data.drop(columns=['last_visit_date'])



In [8]:
# Rearrange column order

def reorder_columns(data, desired_order):
    """Return a new frame whose columns are exactly ``desired_order``, in order.

    Columns named in ``desired_order`` that ``data`` lacks are created and
    filled with ``None``. Columns of ``data`` not named in ``desired_order``
    are dropped. The input frame is left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to reorder.
    desired_order : list
        Column names in the order they should appear.

    Returns
    -------
    pandas.DataFrame
        An independent copy with the requested columns.
    """
    # Work on a copy so that adding placeholder columns does not mutate the
    # caller's frame as a side effect.
    result = data.copy()
    for col in desired_order:
        if col not in result.columns:
            result[col] = None  # placeholder for a column the input lacks

    # Every requested column now exists, so plain selection cannot raise
    # KeyError. The final copy detaches the selection so later cells can add
    # columns without SettingWithCopyWarning.
    return result[list(desired_order)].copy()

# Column order for the working frame. reorder_columns keeps only these columns
# and creates any that are missing.
desired_order = ['last_visited_date', 'Domain', 'Name', 'revenue_in_million', 'employee_count', 'contact_email', 'mobile_number', 'position', 'time_spent_on_site', 'linkedin_profile', 'visit_count', 'location', 'department', 'Seniority', 'Industry', 'Founded Year', 'contact_status', 'industry', 'company_name', 'company_linkedin', 'lead_id', 'lead_url']

data = reorder_columns(data, desired_order)


In [9]:

def calculate_visits_score(num_visits):
    """Map a visit count to a score.

    20+ visits -> 50, 10+ -> 40, 6+ -> 30, 4+ -> 20, 2+ -> 10, otherwise 2.
    """
    # (minimum visits, score), checked from the highest tier down
    tiers = ((20, 50), (10, 40), (6, 30), (4, 20), (2, 10))
    for minimum, score in tiers:
        if num_visits >= minimum:
            return score
    return 2

def add_visits_score_column(data, visits_column="visit_count", score_column="visits_score"):
    """Write ``score_column`` into ``data`` with one visits score per ``visits_column`` value.

    Scoring is delegated to ``calculate_visits_score``; ``data`` is modified
    in place and returned.
    """
    visit_counts = data[visits_column]
    scores = visit_counts.apply(calculate_visits_score)
    data[score_column] = scores
    return data


# NOTE(review): the "visits_score" column created here is not part of the total
# computed in cell 19 nor of the final column list in cell 20; cell 13 computes
# the same score again as "number_of_visit_score".
data = add_visits_score_column(data)  # Using default column names


In [10]:

def add_scoring_columns(data):
    """Add an ``industry_score`` column: 20 for a listed industry, otherwise 0.

    The "Industry" column is compared by exact, case-sensitive equality.
    ``data`` is modified in place and returned.
    """
    # Industries that earn the relevance bonus
    relevant_industries = (
        "Computer Software", "Marketing & Advertising", "Professional Services", "Information Technology & Services",
        "Management Consulting", "IT Services and IT Consulting", "Information Technology and Services",
        "Software Development", "Luxury Goods & Jewelry", "Real Estate", "Internet Software & Services",
        "Retail", "Hospitality", "Sports", "Airlines/Aviation", "Manufacturing", "Computer & Network Security",
    )

    def industry_points(industry):
        if industry in relevant_industries:
            return 20
        return 0

    data["industry_score"] = data["Industry"].apply(industry_points)
    return data

# Adds the "industry_score" column.
data = add_scoring_columns(data)

In [11]:

def calculate_region_score(location):
    """Score a location by country tier: tier 1 -> 30, tier 2 -> 20, tier 3 -> 10.

    Missing values and locations not in any tier score 0. The comparison is an
    exact match against the country names listed below.
    """
    tier_1 = [
        "United States", "Canada", "United Kingdom", "Germany", "France", "Italy", "Spain",
        "Australia", "Netherlands", "Sweden", "Norway", "Switzerland", "Denmark", "Belgium",
        "Finland", "Japan", "Singapore", "New Zealand", "South Korea", "Israel", "Luxembourg",
        "Ireland", "Austria"
    ]
    tier_2 = ["India", "China", "UAE", "Saudi Arabia"]
    tier_3 = [
        "Malaysia", "Indonesia", "Brazil", "South Africa", "Vietnam", "Philippines", "Thailand",
        "Mexico", "Turkey", "Argentina", "Poland", "Romania", "Czech Republic", "Morocco",
        "Kenya", "Nigeria", "Egypt", "Colombia", "Peru", "Chile"
    ]

    if pd.isna(location):  # NaN or None
        return 0

    # Tiers are checked in priority order; the first one containing the location wins.
    for countries, score in ((tier_1, 30), (tier_2, 20), (tier_3, 10)):
        if location in countries:
            return score
    return 0


def add_location_score_column(df, location_column="location", new_column_name="location_score"):
    """Write ``new_column_name`` into ``df`` with one region score per ``location_column`` value.

    Scoring is delegated to ``calculate_region_score``; ``df`` is modified in
    place and returned.
    """
    locations = df[location_column]
    scores = locations.apply(calculate_region_score)
    df[new_column_name] = scores
    return df


# Add "location_score", derived from the "location" column.
data = add_location_score_column(data)  # Uses default column names

In [12]:
def score_role(position):
    """Score a job title by seniority.

    Returns 30 for C-level titles ("c-level", "ceo", "cto", "cfo"), 20 for
    VP / vice president / director, 10 for titles containing "manager", 5 for
    any other title and 0 when the title is missing. Matching is
    case-insensitive.
    """
    if pd.isnull(position):
        return 0
    # str() keeps non-string cells (e.g. numbers) from crashing on .lower().
    position = str(position).lower()
    # The C-level acronyms must match as whole words: a plain substring test
    # finds "cto" inside "director", "contractor", "doctor", ... and would
    # score those titles as C-level.
    if re.search(r"\b(?:c-level|ceo|cto|cfo)\b", position):
        return 30
    elif any(title in position for title in ["vp", "vice president", "director"]):
        return 20
    elif "manager" in position:
        return 10
    else:
        return 5

# Add "role_score", derived from the "position" column.
data["role_score"] = data["position"].apply(score_role)


In [13]:
def calculate_visits_score(num_visits):
    """Map a visit count to a score.

    20+ visits -> 50, 10+ -> 40, 6+ -> 30, 4+ -> 20, 2+ -> 10, otherwise 2.

    NOTE(review): a function with this name and the same thresholds is already
    defined in an earlier cell; this definition shadows it.
    """
    # (minimum visits, score) pairs, highest tier first; the first match wins.
    thresholds = ((20, 50), (10, 40), (6, 30), (4, 20), (2, 10))
    return next((score for floor, score in thresholds if num_visits >= floor), 2)

def add_visits_score_column(data, visits_column="visit_count", score_column="number_of_visit_score"):
    """Write ``score_column`` into ``data`` with one visits score per ``visits_column`` value.

    Scoring is delegated to ``calculate_visits_score``; ``data`` is modified
    in place and returned.
    """
    visit_counts = data[visits_column]
    scores = visit_counts.apply(calculate_visits_score)
    data[score_column] = scores
    return data


data = add_visits_score_column(data, visits_column="visit_count", score_column="number_of_visit_score") # Adds "number_of_visit_score", which the total score in cell 19 uses

In [14]:

def calculate_visit_duration_score(duration_seconds):
    """Map a visit duration in seconds to a score.

    More than 60 s -> 30, more than 30 s -> 20, more than 10 s -> 10,
    otherwise 5.
    """
    # (exclusive lower bound in seconds, score), longest tier first
    tiers = ((60, 30), (30, 20), (10, 10))
    for lower_bound, score in tiers:
        if duration_seconds > lower_bound:
            return score
    return 5

def add_visit_duration_score_column(data, duration_column="time_spent_on_site", score_column="visit_duration_score"):
    """Write ``score_column`` into ``data`` with one duration score per row.

    ``duration_column`` is first converted to numeric in place, with
    unparseable values becoming NaN. If that conversion raises, the error is
    printed and ``data`` is returned without the score column. Scoring is
    delegated to ``calculate_visit_duration_score``.
    """
    try:
        # errors='coerce' turns unparseable entries into NaN instead of raising
        numeric_durations = pd.to_numeric(data[duration_column], errors='coerce')
        data[duration_column] = numeric_durations
    except Exception as e:
        print(f"Error converting column to numeric: {e}")
        return data  # conversion failed: hand the frame back unchanged
    else:
        scores = data[duration_column].apply(calculate_visit_duration_score)
        data[score_column] = scores
        return data


data = add_visit_duration_score_column(data)  # Adds "visit_duration_score" from "time_spent_on_site"
#


In [15]:
#company size score 


def calculate_company_size_score(employees):
    """Map an employee count to a company size score.

    10000+ -> 50, 1000+ -> 40, 500+ -> 30, 100+ -> 20, 50+ -> 10, 1+ -> 5.
    Missing values, values that cannot be converted with ``int`` and counts
    below 1 score 0.
    """
    if pd.isna(employees):  # missing / NaN
        return 0

    try:
        headcount = int(employees)
    except (ValueError, TypeError):
        return 0

    # (minimum headcount, score), largest tier first
    size_tiers = ((10000, 50), (1000, 40), (500, 30), (100, 20), (50, 10), (1, 5))
    for floor, score in size_tiers:
        if headcount >= floor:
            return score
    return 0  # headcount below 1


def add_company_size_score_column(data, employees_column="employee_count", score_column="employee_count_score"):
    """Write ``score_column`` into ``data`` with one company size score per row.

    ``employees_column`` is first converted to numeric in place, with
    unparseable values becoming NaN. If that conversion raises, the error is
    printed and ``data`` is returned without the score column. Scoring is
    delegated to ``calculate_company_size_score``.
    """
    try:
        # errors='coerce' turns unparseable entries into NaN instead of raising
        numeric_counts = pd.to_numeric(data[employees_column], errors='coerce')
        data[employees_column] = numeric_counts
    except Exception as e:
        print(f"Error converting column to numeric: {e}")
        return data  # conversion failed: hand the frame back unchanged
    else:
        scores = data[employees_column].apply(calculate_company_size_score)
        data[score_column] = scores
        return data



# Adds "employee_count_score", derived from "employee_count".
data = add_company_size_score_column(data)

In [16]:
def calculate_contact_score(linkedin_profile, contact_email):
    """Score contactability: 30 with both values present, 20 with one, 0 with neither.

    A value counts as present when ``pd.notna`` is true for it.
    """
    has_linkedin = pd.notna(linkedin_profile)
    has_email = pd.notna(contact_email)

    if not (has_linkedin or has_email):
        return 0  # neither channel available
    return 30 if (has_linkedin and has_email) else 20

def add_contact_score_column(data, linkedin_column="linkedin_profile", email_column="contact_email", score_column="contact_score"):
    """Write ``score_column`` into ``data`` with one contact score per row.

    Each row's ``linkedin_column`` and ``email_column`` values are passed to
    ``calculate_contact_score``; ``data`` is modified in place and returned.
    """
    def score_row(row):
        return calculate_contact_score(row[linkedin_column], row[email_column])

    data[score_column] = data.apply(score_row, axis=1)
    return data


# Adds "contact_score" from "linkedin_profile" and "contact_email".
data = add_contact_score_column(data)

In [17]:

def calculate_last_visit_score(last_visited_date_str):
    """Calculates a score based on the number of days since the last visit.

    Accepts a ``YYYY-MM-DD`` string, a ``datetime.date``, a
    ``datetime.datetime`` or a ``pandas.Timestamp``.

    Scores by age of the visit: 0-7 days -> 50, 8-14 -> 40, 15-30 -> 30,
    31-45 -> 20, 46-60 -> 10, older -> 0. Missing values, unparseable values
    and dates in the future score 0.
    """

    if pd.isna(last_visited_date_str):  # Handle missing values (NaT or None)
        return 0

    try:
        if isinstance(last_visited_date_str, date):
            # datetime and pandas.Timestamp are date subclasses that expose
            # .date(); a plain date is used as-is. Accepting them keeps the
            # score correct if the column has already been parsed to datetime.
            to_date = getattr(last_visited_date_str, "date", None)
            last_visited_date = to_date() if callable(to_date) else last_visited_date_str
        else:
            last_visited_date = date.fromisoformat(last_visited_date_str)  # Directly parse YYYY-MM-DD
    except (ValueError, TypeError):  # Handle invalid date formats or other errors
        return 0

    days_difference = (date.today() - last_visited_date).days
    if days_difference < 0:
        return 0  # date lies in the future

    # (maximum age in days, score). The first tier starts at 0 so that a visit
    # made today gets the top score instead of falling through to 0.
    recency_tiers = ((7, 50), (14, 40), (30, 30), (45, 20), (60, 10))
    for max_days, score in recency_tiers:
        if days_difference <= max_days:
            return score
    return 0


def add_last_visit_score_column(data, date_column="last_visited_date", score_column="last_visit_score"):
    """Write ``score_column`` into ``data`` with one recency score per ``date_column`` value.

    Scoring is delegated to ``calculate_last_visit_score``; ``data`` is
    modified in place and returned.
    """
    visit_dates = data[date_column]
    scores = visit_dates.apply(calculate_last_visit_score)
    data[score_column] = scores
    return data
# Adds "last_visit_score", derived from "last_visited_date".
data = add_last_visit_score_column(data)

In [18]:
#days since last visit


# Convert 'last_visited_date' to datetime. After this line the column holds
# datetime values rather than the 'YYYY-MM-DD' strings produced in cell 7.
data['last_visited_date'] = pd.to_datetime(data['last_visited_date'], format='%Y-%m-%d')

# Define a function that calculates the days since the last visit
def calculate_days_since(visit_date):
    """Return the number of whole days between today (at midnight) and ``visit_date``."""
    # normalize() drops the time of day so the difference is in whole days
    midnight_today = pd.to_datetime('today').normalize()
    elapsed = midnight_today - visit_date
    return elapsed.days

# Add 'days_since_last_visit' by applying calculate_days_since to each
# 'last_visited_date' value.
data['days_since_last_visit'] = data['last_visited_date'].apply(calculate_days_since)

# Display the updated dataframe

In [19]:
#adding total scores


# Total score = sum of the eight component scores, added column by column.
# Because `+` is used, a NaN in any component makes that row's total NaN.
data['total_score'] = data['industry_score'] + data['location_score'] + data['role_score'] + data['number_of_visit_score'] + data['visit_duration_score'] + data['employee_count_score'] + data['contact_score'] + data['last_visit_score']

In [20]:

# Final column order for the exported file.
new_order = ['last_visited_date','days_since_last_visit', 'total_score', 'Domain', 'Name', 'revenue_in_million',
             'employee_count', 'contact_email', 'mobile_number', 'position',
             'time_spent_on_site', 'linkedin_profile', 'visit_count', 'location',
             'department', 'Seniority', 'Industry', 'Founded Year', 'contact_status',
             'industry', 'company_name', 'company_linkedin', 'lead_id', 'lead_url',
             'industry_score', 'location_score', 'role_score', 'number_of_visit_score',
             'visit_duration_score', 'employee_count_score', 'contact_score', 'last_visit_score']

# reindex keeps only the columns named above, in this order. Any other column
# (e.g. 'visits_score' from cell 9) is dropped, and a listed column that does
# not exist is created and filled with NaN.
data = data.reindex(columns=new_order)

In [21]:
# Write the final frame to the output path, without the index column.
data.to_csv(output_url, index=False)
# Column names of the exported frame; not used again in the visible notebook.
column_list = data.columns.tolist()
