# Main Project Notebook



## Spot-Checking Models

#### Baseline Model

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier


from sklearn.metrics import accuracy_score

# `Pipeline` is used throughout this cell but is not part of the cell's import
# list; import it here so the cell runs on a fresh kernel (Restart & Run All).
from sklearn.pipeline import Pipeline

# Baseline model: scale the numeric columns, one-hot encode the categorical
# columns, and fit an untuned random forest. Plain accuracy is reported.
# NOTE(review): X_train / X_test / y_train / y_test are not defined in this
# cell; they must already exist in the kernel — confirm where they come from.

# Separate numeric and categorical features
numeric_features = ['funding_rounds', 'milestones', 'relationships', 'tracking_interval_days']
categorical_features = ['continent', 'funding_total_bin']

# Create transformers for numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# handle_unknown='ignore' encodes categories unseen during fit as all-zeros
# instead of raising at predict time.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers using ColumnTransformer. Columns not listed above are
# dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create the RandomForestClassifier (fixed seed for reproducibility)
classifier = RandomForestClassifier(random_state=42)

# Create the pipeline: preprocessing is fitted on the training data only and
# re-applied to the test data at predict time.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', classifier)])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict using the pipeline
y_pred = pipeline.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

#### Binary Classification Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Binary-classification variant of the baseline: same preprocessing, but the
# random forest re-weights classes ('balanced') and is trained on the
# X2_* / y2_* split.
# NOTE(review): X2_train / X2_test / y2_train are not defined in this cell;
# they must already exist in the kernel.

# Feature groups
numeric_features = ['funding_rounds', 'milestones', 'relationships', 'tracking_interval_days']
categorical_features = ['continent', 'funding_total_bin']

# Per-group transformers
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each feature group through its transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Class-weighted random forest with a fixed seed
rf = RandomForestClassifier(class_weight='balanced', random_state=43)

# Preprocessing + classifier in a single estimator
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf),
])

# Train on the binary-target training split, then predict the held-out split
pipeline.fit(X2_train, y2_train)
y_pred_binary_classif = pipeline.predict(X2_test)

#### Balanced Bagging with SMOTE

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_validate
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.pipeline import Pipeline


# Needed for the test-set evaluation at the end of this cell.
from sklearn.metrics import balanced_accuracy_score

# SMOTE-bagging model: each bagging member is trained on a bootstrap sample
# that is re-balanced by the configured sampler (SMOTE).
# NOTE(review): X and y are not defined in this cell; they must already exist
# in the kernel. This cell also rebinds X_train / X_test / y_train / y_test.

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Specify categorical and numerical feature columns
categorical_features = ['continent', 'high_level_category']
numerical_features = ['funding_rounds', 'milestones', 'relationships', 'product_count', 'office_count', 'Company_degree_count', 'funding_total_usd']

# Create a column transformer with OneHotEncoder and StandardScaler.
# handle_unknown='ignore' keeps prediction from failing when the test split
# contains a category that was absent from the training split.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numerical_features)
    ],
    # Keep other features as they are.
    # NOTE(review): any passed-through column must already be numeric — confirm.
    remainder='passthrough'
)

# Create the SMOTE-Bagging classifier; both the sampler and the ensemble are
# seeded so the score is reproducible across runs.
smote_bagging = BalancedBaggingClassifier(sampler=SMOTE(random_state=42), random_state=42)

# Create a pipeline
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', smote_bagging)
])

# Optional: cross-validation on the training set (the pipeline does its own
# preprocessing, so the raw X_train is passed).
#cv_results = cross_validate(pipeline, X_train, y_train, scoring="balanced_accuracy")

# Print the mean and standard deviation of cross-validation results
#print(f"Cross-Validation Balanced Accuracy: {cv_results['test_score'].mean():.3f} +/- {cv_results['test_score'].std():.3f}")

# Fit the pipeline on the entire training set
pipeline.fit(X_train, y_train)

# Evaluate the pipeline on the separate test set. `pipeline.score` would
# return plain accuracy, so balanced accuracy is computed explicitly to match
# the metric named in the printed message.
y_pred_smote_bagging = pipeline.predict(X_test)
test_score = balanced_accuracy_score(y_test, y_pred_smote_bagging)
print(f"Test Set Balanced Accuracy: {test_score:.3f}")

#### Histogram-Based Gradient Boosting Classifier with Random Undersampling

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import numpy as np

# Histogram-based gradient boosting trained on a randomly undersampled
# training set.
# NOTE(review): X_train / y_train are not defined in this cell; they must
# already exist in the kernel.

# Define the columns to encode and scale
columns_to_encode = ['continent', 'high_level_category']
columns_to_scale = ['funding_rounds', 'milestones', 'relationships',
                    'product_count', 'office_count', 'Company_degree_count',
                    'funding_total_usd', 'tracking_interval_days']

# Create a column transformer with OneHotEncoder and StandardScaler.
# Columns not listed are dropped (default remainder behaviour).
# sparse_threshold=0 forces a dense output: OneHotEncoder emits a sparse
# matrix by default, and HistGradientBoostingClassifier rejects sparse input,
# so a wide one-hot block could otherwise make the fit fail.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), columns_to_encode),
        ('num', StandardScaler(), columns_to_scale)
    ],
    sparse_threshold=0,
)

# Random undersampler (seeded for reproducibility)
rus = RandomUnderSampler(random_state=0)

# Create a pipeline with preprocessing, resampling, and model.
# `Pipeline` here is imblearn.pipeline.Pipeline (imported in this cell), which
# is what allows a sampler to be used as an intermediate step.
model = Pipeline([
    ('preprocess', preprocessor),
    ('resample', rus),
    ('classifier', HistGradientBoostingClassifier(random_state=0))
])

# Fit the pipeline on the training data
model.fit(X_train, y_train)


In [None]:
# Also tried ADASYN with HistGradientBoostingClassifier, which gave the least favourable G-Mean results, so it is excluded from this notebook.
# Tested the above models while varying the feature subset each time.
# The best feature set was found with HistGradientBoosting + random undersampling, but not every possible combination was attempted.
