In [25]:
import pandas as pd

def extract_primary_address_part(address):
    """Return the normalized primary part of an address.

    The primary part is everything before the first comma, with surrounding
    whitespace stripped and the text lower-cased.

    Args:
        address: Raw address cell. Missing values (None / NaN / pd.NA) are
            accepted; non-string values are converted with ``str()`` first.

    Returns:
        str: The normalized primary part, or "" when the address is missing.
    """
    if pd.isna(address):
        return ""
    # A cell is not guaranteed to be a string (e.g. a numeric value); calling
    # .split() on it directly would raise AttributeError.
    text = address if isinstance(address, str) else str(address)
    # Only the text before the first comma is needed, so split once.
    return text.split(',', 1)[0].strip().lower()

# Load the data from both CSV files
customers = pd.read_csv('770 blank.csv', encoding='utf-8')
calendar = pd.read_csv('Calendar Dump.csv', encoding='ISO-8859-1')  # Adjust encoding if necessary

# Normalize the addresses in both dataframes to extract the primary part
customers['Normalized Address'] = customers['Service Address'].apply(extract_primary_address_part)
calendar['Normalized Location'] = calendar['Location'].apply(extract_primary_address_part)

# Missing addresses normalize to "" on both sides. Leaving them in the join
# would pair every customer without an address with every calendar entry
# without a location, so blank keys are excluded before merging.
customers_with_address = customers[customers['Normalized Address'] != '']
calendar_with_location = calendar[calendar['Normalized Location'] != '']

# Merge the dataframes based on normalized addresses
merged_data = pd.merge(
    customers_with_address,
    calendar_with_location,
    left_on='Normalized Address',
    right_on='Normalized Location',
    how='inner',
)

# Select only the required columns for the output
output_data = merged_data[['Account Name', 'Normalized Address', 'Guestlist']]

# Save the result to a new CSV file; the same path is reused in the message so
# the printed filename always matches the file actually written.
matched_output_path = 'matched_calendar_entries111.csv'
output_data.to_csv(matched_output_path, index=False)

print(f"Matched calendar entries have been saved to '{matched_output_path}'.")


Matched calendar entries have been saved to 'matched_calendar_entries.csv'.


In [31]:
import pandas as pd
import re

# Read back the matched entries written by the previous cell
df = pd.read_csv(filepath_or_buffer='matched_calendar_entries111.csv')

def extract_emails(text):
    """Return every email address found in ``text``, in order of appearance.

    Args:
        text: Cell content to scan. Anything that is not a string (None, NaN,
            numbers, ...) is treated as containing no emails.

    Returns:
        list[str]: The matched addresses; an empty list when there are none.
    """
    # Covers missing values as well as any other non-text cell, which
    # re.findall would reject with a TypeError.
    if not isinstance(text, str):
        return []
    # The TLD class is [A-Za-z]; a '|' inside a character class is a literal
    # pipe, not alternation, and would let "a@b.com|c" match as one address.
    return re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)

# Apply the extraction function to the 'Guestlist' column
df['Emails'] = df['Guestlist'].apply(extract_emails)

# The row with the most emails decides how many Email_N columns are needed.
# default=0 covers a frame with no rows, where a pandas .max() would return
# NaN and range() would raise TypeError.
max_emails = max((len(found) for found in df['Emails']), default=0)

# Create one column per email position; rows with fewer emails get None
for i in range(max_emails):
    df[f'Email_{i+1}'] = df['Emails'].apply(lambda x: x[i] if i < len(x) else None)

# Drop the original 'Guestlist' and temporary 'Emails' columns
df = df.drop(columns=['Guestlist', 'Emails'])

# Save the cleaned data to a new CSV file
df.to_csv('cleaned_matched_calendar_entries.csv', index=False)

# NOTE: this prints account names and email addresses; clear the cell output
# before sharing the notebook.
print(df)
print("Cleaned emails have been saved to 'cleaned_matched_calendar_entries.csv'.")


         Account Name     Normalized Address  \
0        John Pharaon      1464 anton square   
1        John Pharaon      1464 anton square   
2         Bo Lindgren  118 campbell crescent   
3           Sadaf Rai         6932 bilbao ln   
4     JENNIFER GORDON    14 lloyd george ave   
..                ...                    ...   
312       Sean Coster  1958 forest valley dr   
313  KAYA TACHE-GREEN        269 sterling rd   
314    Michael Caplan       129 invermay ave   
315   Hagop Barounian   10 valencia crescent   
316        Jerry Wang       15 flax field ln   

                                    Email_1  \
0                       jepharaon@gmail.com   
1                       jepharaon@gmail.com   
2                      aek.rsi647@gmail.com   
3                     ahsanrai.ar@gmail.com   
4                     jgordon88@hotmail.com   
..                                      ...   
312       nextlevelconsulting.ali@gmail.com   
313                   kayaveassaf@gmail.com   


In [35]:
import pandas as pd

# Read the per-email-column table produced by the previous cell
df = pd.read_csv(filepath_or_buffer='cleaned_matched_calendar_entries.csv')

# Define a function to clean cell data
def clean_cell_content(cell):
    """Blank out a text cell that mentions 'nextlevel' or 'nxtlevel'.

    The check is case-insensitive. Non-string cells are returned untouched,
    as are strings that contain neither marker.
    """
    # Only text can contain the markers; pass everything else through
    if not isinstance(cell, str):
        return cell
    lowered = cell.lower()
    flagged = any(marker in lowered for marker in ('nextlevel', 'nxtlevel'))
    return '' if flagged else cell

# Run every column through the cleaner so flagged cells are blanked wherever they appear
for column_name in df.columns:
    scrubbed_values = df[column_name].apply(clean_cell_content)
    df[column_name] = scrubbed_values

# Overwrite the input file with the scrubbed table
df.to_csv(path_or_buf='cleaned_matched_calendar_entries.csv', index=False)

print("The cleaned data has been saved back to 'cleaned_matched_calendar_entries.csv'.")


The cleaned data has been saved back to 'cleaned_matched_calendar_entries.csv'.


In [43]:
import pandas as pd

# Load the data from the CSV file
df = pd.read_csv('cleaned_matched_calendar_entries.csv')

# Every column whose name contains 'Email' is treated as an email column
# (e.g. 'Email_1', 'Email_2', ...); adjust if the data uses other names.
email_columns = [col for col in df.columns if 'Email' in col]

# Collect all emails into a single Series, ignoring NaN values.
# pd.concat raises ValueError on an empty list, so a file without any email
# columns falls back to an empty Series instead.
if email_columns:
    all_emails = pd.concat([df[col].dropna() for col in email_columns], ignore_index=True)
else:
    all_emails = pd.Series(dtype=object)

# Drop duplicates to get unique emails
unique_emails = all_emails.drop_duplicates().reset_index(drop=True)

# Build the output frame with to_frame(name=...). Passing a *named* Series to
# pd.DataFrame(..., columns=[...]) selects by name, so with a single email
# column (Series still named 'Email_1') the result would contain no data.
email_df = unique_emails.to_frame(name='Unique_Emails')

# Save the unique emails to a new CSV file
email_df.to_csv('unique_emails.csv', index=False)

print("Unique emails have been saved to 'unique_emails.csv'.")


Unique emails have been saved to 'unique_emails.csv'.
