In [172]:
import pandas as pd
import numpy as np
import re

data_path = '/Users/tony/Desktop/outbound/data/master_data.csv'
out_put_path = '/Users/tony/Desktop/outbound/data/cleaned_data.csv'

data = pd.read_csv(data_path)

#function to remove unwanted columns


def remove_unwanted_columns(data):
    columns_to_keep = [
        'Domain', 'Name', 'Email', 'Cell/Mobile Number', 'Job Title', 'Personal Linkedin',
        'Duration', 'Number of Visits', 'Visitor Location', 'Department', 'Seniority',
        'Industry', 'Founded Year', 'Estimated Annual Revenue', 'Type of contact',
        'Employees Range', 'Sector', 'company.name', 'Company.domain', 'Company.sector',
        'Company.industry', 'Company.country', 'company.linkedinPage', 'company.lastVisit',
        'leadId', 'leadLink', 'Summary.visits', 'Summary.duration',
        'Summary.utm_source', 'Summary.utm_medium'
    ]
    
    return data[[col for col in columns_to_keep if col in data.columns]]

# Apply function to data
data = remove_unwanted_columns(data)

In [173]:
#rename columns to better fit the needs
def rename_columns(data):
    """Renames columns in the given DataFrame."""
    column_renames = {
        "Domain Name": "website",
        "Email": "contact_email",
        "Cell/Mobile Number": "mobile_number",
        "Job Title": "position",
        "Personal Linkedin": "linkedin_profile",
        "Duration": "visit_duration",
        "Number of Visits": "visit_count",
        "Visitor Location": "location",
        "Department": "department",
        "Estimated Annual Revenue": "revenue_range",
        "Type of contact": "contact_status",
        "Employees Range": "employee_count_range",
        "Sector": "industry",
        "company.name": "company_name",
        "company.linkedinPage": "company_linkedin",
        "company.lastVisit": "last_visit_date",
        "leadId": "lead_id",
        "leadLink": "lead_url",
        "summary.leadScore": "lead_score"
    }
    
    return data.rename(columns=column_renames)

data = rename_columns(data)


In [174]:
#convert visit duration "visit_duration" to seconds

def convert_duration_to_seconds(data, column_name, new_column_name):
    def duration_to_seconds(duration):
        if isinstance(duration, str) and ':' in duration:
            h, m, s = map(int, duration.split(':'))
            return h * 3600 + m * 60 + s
        return 0  # Default for invalid or missing data
    
    data[new_column_name] = data[column_name].apply(duration_to_seconds)
    return data

# Apply function with the new column name "time_spent_on_site"
data = convert_duration_to_seconds(data, "visit_duration", "time_spent_on_site")
data = data.drop(columns=["visit_duration"])

In [175]:
def process_and_create_revenue_column(data, input_column="revenue_range", output_column="revenue"):
    """Standardizes revenue and creates a new column in the DataFrame."""

    def standardize_annual_revenue(revenue_range):
        """Standardizes a revenue range string, returning the higher value with its suffix."""

        if not isinstance(revenue_range, str):
            return "0"  # Handle non-string input

        match = re.match(r"\$([\d.]*[KM]?)[\s-]*\$([\d.]*[KM]?)", revenue_range, re.IGNORECASE)
        if match:
            high_revenue = match.group(2).upper()
            return high_revenue

        return "0"  # Default for invalid format

    data[output_column] = data[input_column].apply(standardize_annual_revenue)
    return data

data = process_and_create_revenue_column(data) # Process and create 'revenue' column
#drop the original revenue column
data = data.drop(columns=["revenue_range"])


In [176]:
def convert_revenue_to_millions(revenue_str):
    """Converts a revenue string (e.g., '5M', '100K', '1B') to millions."""

    if isinstance(revenue_str, (int, float)): # handles if the input is a number already
        return revenue_str

    if not isinstance(revenue_str, str):  # Handle non-string input
        return 0

    revenue_str = revenue_str.upper()  # Handle lowercase (k, m, b)

    match = re.match(r"([\d.]+)([KM]?)", revenue_str)  # Improved regex
    if match:
        amount = float(match.group(1))
        multiplier = match.group(2)

        if multiplier == 'K':
            return amount / 1000
        elif multiplier == 'M':
            return amount
        elif multiplier == 'B':
            return amount * 1000
        else: # handles if there is no multiplier present (e.g. 1000000)
          try:
            return float(revenue_str)/1000000
          except ValueError:
            return 0
    return 0  # Default for invalid format



def create_revenue_in_millions_column(data, input_column="revenue", output_column="revenue_in_million"):
    """Creates a new 'revenue_in_million' column."""
    data[output_column] = data[input_column].apply(convert_revenue_to_millions)
    return data


data = create_revenue_in_millions_column(data)
data = data.drop(columns=['revenue'])


In [177]:
def convert_employee_range(employee_range):
    """Converts an employee count range string to a single number (higher value or integer)."""

    if pd.isna(employee_range):  # Check for NaN using pandas.isna()
        return 0  # Return 0 for NaN values

    if isinstance(employee_range, (int, float)):  # Check if it's already a number
        return int(employee_range)

    if not isinstance(employee_range, str):
        return 0  # Handle other non-string/non-numeric input

    match = re.search(r"- (\d+(?:,\d+)?)", employee_range)
    if match:
        try:
            count = int(match.group(1).replace(",", ""))
            return count
        except ValueError:
            return 0
    else:
        try:
            count = int(employee_range.replace(",", ""))
            return count
        except ValueError:
            return 0

def create_employee_count_column(data, input_column="employee_count_range", output_column="employee_count"):
    """Creates a new 'employee_count' column."""
    data[output_column] = data[input_column].apply(convert_employee_range)
    return data

data = create_employee_count_column(data)
data = data.drop(columns=["employee_count_range"])



In [178]:


def extract_date(timestamp):
    
    """Extracts the date (YYYY-MM-DD) from a timestamp string."""
    if not isinstance(timestamp, str):  # Handle non-string input (including NaT)
        return None

    match = re.match(r"(\d{4}-\d{2}-\d{2})", timestamp)  # Improved Regex
    if match:
        return match.group(1)
    return None  # Return None for invalid timestamps

def create_last_visited_date_column(data, input_column="last_visit_date", output_column="last_visited_date"):
    """Creates a new 'last_visited_date' column."""
    data[output_column] = data[input_column].apply(extract_date)
    return data


data = create_last_visited_date_column(data)
data = data.drop(columns=['last_visit_date'])



In [179]:
#rearrage column order

def reorder_columns(data, desired_order):
    """Reorders columns in a DataFrame."""
    # Ensure all desired columns are present (handle missing columns):
    for col in desired_order:
        if col not in data.columns:
            data[col] = None  # Or some other default value like "" or np.nan

    # Reorder the columns (handles potential KeyError if column is not present):
    existing_columns = [col for col in desired_order if col in data.columns]
    data = data[existing_columns]
    return data

# Your desired column order:
desired_order = ['last_visited_date', 'Domain', 'Name', 'revenue_in_million', 'employee_count', 'contact_email', 'mobile_number', 'position', 'time_spent_on_site', 'linkedin_profile', 'visit_count', 'location', 'department', 'Seniority', 'Industry', 'Founded Year', 'contact_status', 'industry', 'company_name', 'company_linkedin', 'lead_id', 'lead_url']

data = reorder_columns(data, desired_order)


In [None]:
import pandas as pd

def calculate_visits_score(num_visits):
    """Calculates a visits score based on the number of visits."""
    if num_visits >= 20:
        return 50
    elif num_visits >= 10:
        return 40
    elif num_visits >= 6:
        return 30
    elif num_visits >= 4:
        return 20
    elif num_visits >= 2:
        return 10
    else:
        return 2

def add_visits_score_column(data, visits_column="Number of Visits", score_column="visits_score"): # uses data
    """Adds a visits score column to the DataFrame."""
    data[score_column] = data[visits_column].apply(calculate_visits_score) # uses data
    return data


data = add_visits_score_column(data)  # Using default column names
print(data)

data = add_visits_score_column(data, visits_column="Number of Visits", score_column="visitor_visits_score") # Example with different column names
print(data)

In [None]:
print(data.columns.tolist())
data.to_csv(out_put_path, index=False)

#print column names


['last_visited_date', 'Domain', 'Name', 'revenue_in_million', 'employee_count', 'contact_email', 'mobile_number', 'position', 'time_spent_on_site', 'linkedin_profile', 'visit_count', 'location', 'department', 'Seniority', 'Industry', 'Founded Year', 'contact_status', 'industry', 'company_name', 'company_linkedin', 'lead_id', 'lead_url']
