In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Read the prepared repository table from disk
df = pd.read_csv('../data/github_final.csv')

# Feature preparation
# Text columns are skipped. 'license' is mapped to integer codes by
# LabelEncoder; the other inputs are the three columns named in `features`.
# NOTE(review): 'stars_day' may be derived from the target
# ('stargazers_count') — confirm this is not target leakage.
df = df.assign(license=LabelEncoder().fit_transform(df['license']))
features = ['license', 'days_since_created', 'forks_day', 'stars_day']
X = df.loc[:, features]
y = df['stargazers_count']

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree Random Forest and fit it on the training portion
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out rows
y_pred = rf_regressor.predict(X_test)

# Error metrics: MSE, and RMSE as its square root
mse = mean_squared_error(y_test, y_pred)
rmse = pow(mse, 0.5)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")


Mean Squared Error: 17481221.651532706
Root Mean Squared Error: 4181.055088315951


## Linear Regression

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Read the prepared repository table from disk (fresh copy for this model)
df = pd.read_csv('../data/github_final.csv')

# Feature preparation
# This model uses only two predictor columns: 'forks_count' and
# 'days_since_created'. No categorical encoding is applied in this cell.
features = ['forks_count', 'days_since_created']
X = df.loc[:, features]
y = df['stargazers_count']

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit an ordinary Linear Regression on the training portion
linear_regressor = LinearRegression().fit(X_train, y_train)

# Score the held-out rows
y_pred = linear_regressor.predict(X_test)

# Error metrics: MSE, and RMSE as its square root
mse = mean_squared_error(y_test, y_pred)
rmse = pow(mse, 0.5)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")


Mean Squared Error: 66075592.73115545
Root Mean Squared Error: 8128.689484237631


In [16]:
# Requires `linear_regressor` (already fitted) and `features` to exist in the
# kernel, i.e. the Linear Regression training cell must have been run first.

# Pull the learned parameters off the fitted estimator
coefficients, intercept = linear_regressor.coef_, linear_regressor.intercept_

# Print one "name: weight" line per feature, paired positionally
print("Coefficients:")
for feature, coef in zip(features, coefficients):
    print(f"{feature}: {coef}")

# Print the constant term, separated by a blank line
print(f"\nIntercept: {intercept}")


Coefficients:
forks_count: 1.9459109661372487
days_since_created: 1.0631651295619904

Intercept: 3512.4868207386025


### Interpretation:
**Coefficients:** Each coefficient tells you the expected change in `stargazers_count` (popularity) for a one-unit increase in the respective feature, assuming all other features remain constant. For instance, the fitted coefficient for `forks_count` is about 1.95, which means that for each additional fork the model predicts an increase of about 1.95 in `stargazers_count`, all else being equal. Likewise, the `days_since_created` coefficient of about 1.06 means that each additional day since creation adds roughly one predicted star.

**Intercept:** The intercept represents the model's prediction for the target variable when all the features are zero. Here it is about 3512, i.e. the predicted `stargazers_count` for a repository with zero forks and zero days since creation. In many contexts, the intercept might not have a practical interpretation (e.g., it's unlikely all features would be zero), but it's a necessary part of the model to align the regression line with the data.

Understanding these coefficients and the intercept helps in deciphering how each feature influences the prediction, providing a clearer picture of the data's underlying patterns. This interpretability is one of the main advantages of Linear Regression, allowing stakeholders to make informed decisions based on the model's insights.