In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM so the CSV inputs read further down
# are reachable; force_remount=True requests a fresh mount even if the drive
# is already attached.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
# Both input files live in the same Drive folder; keep the path in one place.
DATA_DIR = '/content/drive/MyDrive/Series A-nalysts/archive'

# Funding-round records (merged onto the companies via `object_id` later on).
funding_rounds_df = pd.read_csv(f'{DATA_DIR}/funding_rounds.csv')

# Company/object records. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk.
# NOTE(review): the warning saved with this cell's output is presumably the
# mixed-dtype DtypeWarning that this option addresses - confirm on re-run.
objects_df = pd.read_csv(f'{DATA_DIR}/objects.csv', low_memory=False)

  objects_df = pd.read_csv('/content/drive/MyDrive/Series A-nalysts/archive/objects.csv')


In [None]:
# List the column labels available on the objects table.
objects_df.columns

Index(['id', 'entity_type', 'entity_id', 'parent_id', 'name',
       'normalized_name', 'permalink', 'category_code', 'status', 'founded_at',
       'closed_at', 'domain', 'homepage_url', 'twitter_username', 'logo_url',
       'logo_width', 'logo_height', 'short_description', 'description',
       'overview', 'tag_list', 'country_code', 'state_code', 'city', 'region',
       'first_investment_at', 'last_investment_at', 'investment_rounds',
       'invested_companies', 'first_funding_at', 'last_funding_at',
       'funding_rounds', 'funding_total_usd', 'first_milestone_at',
       'last_milestone_at', 'milestones', 'relationships', 'created_by',
       'created_at', 'updated_at'],
      dtype='object')

### Now we merge `objects_df`, which has all the companies' information, with `funding_rounds_df`, which has the funding-round information for each company, into one dataframe

In [None]:
# Inner join: keep only the objects that have at least one funding round.
# Columns present in both tables are suffixed by pandas (e.g. `id` -> `id_x`).
full_df = objects_df.merge(
    funding_rounds_df,
    how="inner",
    left_on="id",
    right_on="object_id",
)

### Keep only the columns we need. This simplified view of the data lets us quickly see what to expect from it.

In [None]:
# Columns carried forward: the company key, the two dates, the round count,
# the amount raised in the round, and the last-round flag.
selected_columns = [
    'id_x',
    'founded_at',
    'funding_rounds',
    'funded_at',
    'raised_amount_usd',
    'is_last_round',
]
full_df_simplified = full_df[selected_columns]

### Here we convert dates to datetime format and add derived columns such as company age at funding, which is a numerical value. This allows our models to analyze the datetime information. NA fields are filled with the median later, in the scaling step.

In [None]:
# Parse the date columns first so that the chronological sort and the date
# arithmetic below operate on real timestamps rather than on raw strings.
full_df_simplified = full_df_simplified.assign(
    founded_at=pd.to_datetime(full_df_simplified['founded_at']),
    funded_at=pd.to_datetime(full_df_simplified['funded_at']),
)

# Order every company's rounds chronologically; all per-company cumulative
# features and the shifted target depend on this ordering.
full_df_simplified = full_df_simplified.sort_values(by=['id_x', 'funded_at'])

# Keep rounds below 40M USD (rows with a missing amount fail the comparison and
# are removed as well). .copy() detaches the filtered frame so the column
# assignments below do not raise SettingWithCopyWarning.
# NOTE(review): the cumulative features and the next-round target below are
# computed after this filter, so they ignore the removed rounds - confirm
# that this is intended.
full_df_simplified = full_df_simplified[full_df_simplified['raised_amount_usd'] < 40000000].copy()

# Calculate cumulative sum of raised_amount_usd and current number of funding rounds
full_df_simplified['total_funding_until_this_point'] = full_df_simplified.groupby('id_x')['raised_amount_usd'].cumsum()
full_df_simplified['current_number_of_funding_rounds'] = full_df_simplified.groupby('id_x').cumcount() + 1

# Time features, expressed in days so the model receives plain numbers.
full_df_simplified['company_age_at_funding'] = (full_df_simplified['funded_at'] - full_df_simplified['founded_at']).dt.days
# A company's first remaining round has no predecessor, so its gap is set to 0.
full_df_simplified['time_since_last_round'] = full_df_simplified.groupby('id_x')['funded_at'].diff().dt.days.fillna(0)

# Target: the amount raised in the company's next round. The last round of each
# company has no successor, so those rows are dropped.
full_df_simplified['next_round_raised_amount_usd'] = full_df_simplified.groupby('id_x')['raised_amount_usd'].shift(-1)
full_df_simplified = full_df_simplified.dropna(subset=['next_round_raised_amount_usd'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_df_simplified['total_funding_until_this_point'] = full_df_simplified.groupby('id_x')['raised_amount_usd'].cumsum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_df_simplified['current_number_of_funding_rounds'] = full_df_simplified.groupby('id_x').cumcount() + 1


### Drop columns that have very low correlations, aren't used for prediction, or were repurposed into new columns

In [None]:
# Remove the key, the last-round flag and the raw date columns. Reassigning the
# result (instead of inplace=True) avoids mutating a frame that pandas still
# tracks as a slice, which is what raised SettingWithCopyWarning here.
full_df_simplified = full_df_simplified.drop(
    columns=['id_x', 'is_last_round', 'founded_at', 'funded_at']
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_df_simplified.drop(columns=['id_x', 'is_last_round', 'founded_at', 'funded_at'], inplace=True)


### Scale numerical columns

In [None]:
from sklearn.preprocessing import StandardScaler

# Continuous features that get standardized. The two round counters and the
# target column are left in their original units.
numerical_cols = [
    'raised_amount_usd',
    'total_funding_until_this_point',
    'company_age_at_funding',
    'time_since_last_round',
]

# Fit the scaler on these columns and write the standardized values back.
scaler = StandardScaler()
standardized_values = scaler.fit_transform(full_df_simplified[numerical_cols])
full_df_simplified[numerical_cols] = standardized_values

# Any value still missing is replaced by the median of its column.
column_medians = full_df_simplified.median()
full_df_simplified = full_df_simplified.fillna(column_medians)


### Filtering of data on round raised amount

In [None]:
# Keep only rows whose running round counter is below 4.
early_round_mask = full_df_simplified['current_number_of_funding_rounds'] < 4
full_df_simplified = full_df_simplified[early_round_mask]

# Partition on the size of the next round, using a 20M USD threshold.
next_round_amount = full_df_simplified['next_round_raised_amount_usd']
full_df_simplified_less = full_df_simplified[next_round_amount < 20000000]
full_df_simplified_more = full_df_simplified[next_round_amount >= 20000000]

In [None]:
# Show the preprocessed feature table (pandas elides the middle rows of a long
# frame): standardized features, round counters, and the unscaled target.
full_df_simplified

Unnamed: 0,funding_rounds,raised_amount_usd,total_funding_until_this_point,current_number_of_funding_rounds,company_age_at_funding,time_since_last_round,next_round_raised_amount_usd
0,3,0.011063,-0.302527,1,-0.804402,-0.582429,9500000.0
1,3,0.617832,0.318742,2,-0.539209,0.813346,25000000.0
5,5,-0.452937,-0.515066,1,-0.577508,-0.582429,9000000.0
6,5,0.546448,0.073504,2,-0.171304,1.555520,2069200.0
9,5,-0.443057,0.208823,3,0.106076,0.877485,12000000.0
...,...,...,...,...,...,...,...
52611,5,-0.695645,-0.626241,1,-0.283300,-0.582429,1200000.0
52612,5,-0.567152,-0.547765,2,-0.023329,0.785858,6000000.0
52613,5,0.118140,-0.155384,3,0.205306,0.620931,15000000.0
52620,2,-0.652814,-0.606622,1,-0.417928,-0.582429,500000.0


---



---

# END OF DATA PREPROCESSING

---



---



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# train_test_split returns (larger part, test_size part): test_set receives 80%
# of the "< 20M" rows and discard_set the remaining 20%.
# NOTE(review): the model is therefore trained mostly on ">= 20M" rows and
# evaluated only on "< 20M" rows - confirm this split is intended.
test_set, discard_set = train_test_split(full_df_simplified_less, test_size=0.2, random_state=42)

# Split the other dataset into training and validation sets
train_set, validation_set = train_test_split(full_df_simplified_more, test_size=0.2, random_state=42)

# Combine the training set with the discard set
final_train_set = pd.concat([train_set, discard_set])

X_train = final_train_set.drop('next_round_raised_amount_usd', axis=1)
Y_train = final_train_set['next_round_raised_amount_usd']

# And for the test_set
X_test = test_set.drop('next_round_raised_amount_usd', axis=1)
Y_test = test_set['next_round_raised_amount_usd']

# Train the model on the new training set
model = LinearRegression()
model.fit(X_train, Y_train)

# Hand-made example company, expressed in raw (unscaled) units.
data = {
    'funding_rounds': [4],
    'raised_amount_usd': [40000000],
    'total_funding_until_this_point': [65000000],
    'current_number_of_funding_rounds': [2],
    'company_age_at_funding': [900],
    'time_since_last_round': [372],
    'next_round_raised_amount_usd': [80000000]
}

# Create and display the DataFrame
df = pd.DataFrame(data)
print(df)

# The model was fitted on standardized features, so the raw example has to go
# through the same fitted scaler before prediction; feeding raw dollar and day
# values into it yields a meaningless result.
df_test = df.drop('next_round_raised_amount_usd', axis=1)
df_test = df_test[X_train.columns]  # same feature order as during fitting
df_test = df_test.astype({col: 'float64' for col in numerical_cols})
df_test[numerical_cols] = scaler.transform(df_test[numerical_cols])

# Prediction for the single hand-made example
sample_pred = model.predict(df_test)
print(sample_pred)

# Predictions for the held-out test set. The evaluation cells below compare
# y_pred with Y_test row by row, so it must have one prediction per test row.
y_pred = model.predict(X_test)


   funding_rounds  raised_amount_usd  total_funding_until_this_point  \
0               4           40000000                        65000000   

   current_number_of_funding_rounds  company_age_at_funding  \
0                                 2                     900   

   time_since_last_round  next_round_raised_amount_usd  
0                    372                      80000000  
[2.97315904e+14]


In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Root-mean-squared error between the test targets and the predictions
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse}")

# One fitted coefficient per feature, labelled with the training column names
feature_importance = pd.DataFrame(
    {'Coefficient': model.coef_},
    index=X_train.columns,
)
print(feature_importance)


In [None]:
# Absolute prediction error per test row, followed by its summary statistics
prediction_error = Y_test - y_pred
residuals = prediction_error.abs()
residuals.describe()

In [None]:
# Distribution of the residuals currently stored in `residuals`, in 15 bins.
residuals.hist(bins=15)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Absolute prediction error as a percentage of the actual value
percentage_diff = abs((Y_test - y_pred) / Y_test) * 100

# 5%-wide buckets from 0 to 100, plus an open-ended bucket for everything
# above 100%. Without the last bucket pd.cut turns those rows into NaN and
# they disappear from the chart, which makes the model look better than it is.
edges = list(range(0, 105, 5))
bins = edges + [float('inf')]
labels = [f'{low}-{high}%' for low, high in zip(edges[:-1], edges[1:])] + ['>100%']
percentage_diff_binned = pd.cut(percentage_diff, bins=bins, labels=labels, include_lowest=True)

# Plot the bucket counts in bucket order (sort=False keeps the category order)
plt.figure(figsize=(10,6))
percentage_diff_binned.value_counts(sort=False).plot(kind='bar')
plt.xlabel('Percentage Difference Buckets')
plt.ylabel('Frequency')
plt.title('Histogram of Percentage Difference between Predicted and Actual Values')
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Range of the actual values, shared by the reference lines in both scatter plots
actual_min = Y_test.min()
actual_max = Y_test.max()

# Predicted vs actual: points on the dashed diagonal are perfect predictions
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(Y_test, y_pred, alpha=0.5)
ax.plot([actual_min, actual_max], [actual_min, actual_max], 'k--', lw=4)
ax.set_title('Predicted vs Actual Values')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
plt.show()

# Signed residuals (actual minus predicted) against the actual values
residuals = Y_test - y_pred
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(Y_test, residuals, alpha=0.5)
ax.hlines(y=0, xmin=actual_min, xmax=actual_max, colors='red', linestyles='--')
ax.set_title('Residuals vs Actual Values')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Residuals')
plt.show()

# Distribution of the signed residuals
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(residuals, bins=30, alpha=0.7, color='blue')
ax.set_title('Histogram of Residuals')
ax.set_xlabel('Residuals')
ax.set_ylabel('Frequency')
plt.show()
