In [1]:
import pandas as pd

Load the dataset into a Pandas DataFrame:

In [2]:
# Read deal_data.csv from the working directory into the DataFrame `df`
df = pd.read_csv(filepath_or_buffer="deal_data.csv")

In [3]:
# Display the column labels of the loaded DataFrame
df.columns

Index(['Unnamed: 0', 'SourceFundingID', 'Source', 'Domain', 'SourceCompanyID',
       'PostMoneyValuation', 'Type', 'SourceType', 'Amount', 'Original_Date',
       'Date', 'MonthUnknown', 'CreatedAt', 'UpdatedAt', 'VentureStage',
       'Type_combined', 'CurrencyCode', 'ValuationCurrencyCode', 'ID', 'Tech',
       'Agency Consultancy', 'Ecosystem', 'GoogleCountry', 'GoogleCity',
       'Sectors', 'Region', 'FormationDate', 'Amount_ExchangeRate',
       'Amount_USD', 'PostMoneyValuationUSD'],
      dtype='object')

Define a function to clean and standardize the funding round types

In [4]:
def clean_funding_round(type_combined):
    """Normalise a funding-round label.

    The label is lower-cased and every occurrence of 'venture' is rewritten
    as 'series' (for example 'Venture B' becomes 'series b').

    Parameters
    ----------
    type_combined : str or missing value
        Raw round label. Non-string values (None, or the float NaN pandas
        uses for empty CSV cells) are returned unchanged instead of raising
        AttributeError on `.lower()`.

    Returns
    -------
    str or the original value
        The cleaned label, or the input itself when it is not a string.
    """
    if not isinstance(type_combined, str):
        # Missing labels stay missing so the column can still be processed.
        return type_combined
    return type_combined.lower().replace('venture', 'series')


Apply the clean_funding_round function to the 'Type_combined' column

In [5]:
# Overwrite each raw round label with its cleaned form, one element at a time
df['Type_combined'] = df['Type_combined'].map(clean_funding_round)


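Define the priority order for sources (a lower number means a higher priority: Pitchbook is preferred over Crunchbase, and Crunchbase over Dealroom)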

In [6]:
# Sources listed from most to least preferred; ranks start at 1.
source_priority = {
    source_name: rank
    for rank, source_name in enumerate(('Pitchbook', 'Crunchbase', 'Dealroom'), start=1)
}


Sort the DataFrame by 'Original_Date' and source priority

In [7]:
# Attach each row's source rank (sources missing from source_priority map to NaN),
# then order rows by date with the best-ranked source first within a date.
df = (
    df.assign(SourcePriority=df['Source'].map(source_priority))
      .sort_values(by=['Original_Date', 'SourcePriority'], ascending=True)
)


Deduplicate the funding rounds based on the deal span of 190 days

In [8]:
# Threshold for the deal span (in days). Two records are treated as the same
# deal when they share a company and a round type and are dated at most this
# many days after the previous record of that company/type (gaps are chained,
# so a run of closely spaced records collapses into one deal).
deal_span_threshold = 190
deal_span = pd.Timedelta(days=deal_span_threshold)

# Original_Date is parsed so gaps can be measured in days; values that cannot
# be parsed become NaT. `_pos` is a positional row id that stays valid even if
# the index labels are not unique.
work = df.assign(
    _pos=range(len(df)),
    _round_date=pd.to_datetime(df['Original_Date'], utc=True, errors='coerce'),
)

# A row can only be matched against others when its company, round type and
# date are all known; rows missing any of these are always kept.
# NOTE(review): 'Domain' is used as the cross-source company key — confirm it
# is the right identifier (SourceCompanyID differs per source).
comparable = work[['Domain', 'Type_combined', '_round_date']].notna().all(axis=1)

candidates = work[comparable.to_numpy()].sort_values(
    ['Domain', 'Type_combined', '_round_date']
)

# Gap to the previous record of the same company and round type. The first
# record of each group has no predecessor (NaT) and therefore opens a new deal.
gap = candidates.groupby(['Domain', 'Type_combined'])['_round_date'].diff()
deal_id = (gap.isna() | (gap > deal_span)).cumsum()

# Within each deal keep the record from the best-ranked source; rows whose
# source has no rank (NaN) sort last, and ties go to the earliest date.
best_first = candidates.assign(_deal=deal_id.to_numpy()).sort_values(
    ['_deal', 'SourcePriority', '_round_date']
)
kept_pos = best_first.drop_duplicates(subset='_deal', keep='first')['_pos']

# Create a mask to identify duplicate funding rounds (True marks a row to drop)
mask = pd.Series(
    comparable.to_numpy() & ~work['_pos'].isin(kept_pos).to_numpy(),
    index=df.index,
)

# Filter out duplicate funding rounds; the existing row order is preserved
df = df[~mask]


Remove the 'SourcePriority' column

In [9]:
# The helper rank column is no longer needed once rows have been filtered
df = df.drop('SourcePriority', axis='columns')


In [11]:
# Display the first 100 rows of the current DataFrame
df.head(100)

Unnamed: 0.1,Unnamed: 0,SourceFundingID,Source,Domain,SourceCompanyID,PostMoneyValuation,Type,SourceType,Amount,Original_Date,...,Agency Consultancy,Ecosystem,GoogleCountry,GoogleCity,Sectors,Region,FormationDate,Amount_ExchangeRate,Amount_USD,PostMoneyValuationUSD
7541,7541,e578d476-e42a-47a3-aa22-a905633fa840,Crunchbase,,4f7b0e5f-4171-4ee7-ac11-e20730d4e370,,seed,pre seed,250000.0,1919-01-01 00:00:00+00:00,...,,,,,,,,1.0,250000.0,
4629,4629,71698dfe-767e-4756-a1b1-01253d9f04c1,Crunchbase,,14bf5c0a-b5d4-48e4-b600-cec9327d54ef,,seed,seed,,1982-01-01 00:00:00+00:00,...,,,,,,,,,,
5374,5374,8cfbfce9-a8d5-4f46-b068-e2d672797495,Crunchbase,,ec3fc2a9-50fd-4f75-b667-6b8df9540023,,venture,venture,,1986-01-01 00:00:00+00:00,...,,,,,,,,,,
4284,4284,63ebc69d-2559-4198-a73d-d672a9348acd,Crunchbase,,e2b72bc9-7bea-46ce-b220-6963e75bb9b6,,venture,venture,125000.0,1987-01-01 00:00:00+00:00,...,,,,,,,,1.0,125000.0,
5381,5381,8d537add-cd07-4b01-aa12-c9de8ca94e1c,Crunchbase,,3a40bd8c-8e23-4697-8815-02e7908162d1,,seed,seed,,1987-01-01 00:00:00+00:00,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8617,8617,129693,Dealroom,lexiquest.com,1422715,,venture,series c,25000000.0,2000-06-01 00:00:00+00:00,...,False,New York City,United States,New York,EdTech,Northern America,1997-01-01 00:00:00+00:00,1.0,25000000.0,
8858,8858,163508,Dealroom,netformx.com,78667,,venture,series b,15000000.0,2000-06-01 00:00:00+00:00,...,False,Silicon Valley,United States,San Jose,FinTech,Northern America,1994-01-01 00:00:00+00:00,1.0,15000000.0,
7278,7278,dad4cee2-e436-772e-f678-c367b5cddb4d,Crunchbase,altoweb.com,2eafb1fe-38a6-7778-bc1d-72c7ee43de12,,venture,venture b,,2000-06-09 00:00:00+00:00,...,False,Silicon Valley,United States,Palo Alto,EdTech,Northern America,1999-01-01 00:00:00+00:00,,,
3788,3788,5181cc29-2d67-6e0d-b576-7ce418c1aade,Crunchbase,ameranth.com,bd2f88e5-5b43-fa90-f32b-0419988b462e,,seed,seed,,2000-06-20 00:00:00+00:00,...,False,San Diego,United States,San Diego,EdTech;Digital Media,Northern America,1996-01-01 00:00:00+00:00,,,


Save the deduplicated DataFrame to a new CSV file

In [12]:
# Write the result to disk without the DataFrame index
df.to_csv(path_or_buf='deduplicated_funding_data.csv', index=False)