In [0]:
# Optional package installs, kept commented out. Of these, only plotly is
# imported by the cells shown in this notebook.
# %pip install transformers
# %pip install torch
# %pip install TensorFlow
# %pip install flask
# %pip install plotly
# NOTE(review): this installs a distribution literally named "PyPI"; none of the
# cells shown here import a module by that name — confirm the install is intended.
%pip install PyPI

In [0]:
import pandas as pd
from pyspark.sql import SparkSession

# Attach to the running Spark session, or start one labelled "RealEstate".
session_builder = SparkSession.builder.appName("RealEstate")
spark = session_builder.getOrCreate()

# Aggregation over the Zillow listings table:
#   * one output row per (day, city, state, zip code, home status) via GROUP BY ALL
#   * zip codes shorter than five characters are left-padded with "0"
#   * bedrooms, bathrooms, longitude, latitude and price are averaged;
#     a NULL price is counted as 0 in the price average
LISTINGS_DAILY_SQL = """
    SELECT 
      to_date(TIMESTAMP) DATE_DAY,
      CITY,
      STATE,
      CASE WHEN LEN(ZIPCODE) < 5 THEN lpad(ZIPCODE, 5, "0") ELSE ZIPCODE END AS ZIPCODE,
      HOMESTATUS, 
      AVG(TRY_CAST(BEDROOMS AS DOUBLE)) AVERAGE_BEDROOMS,
      AVG(TRY_CAST(BATHROOMS AS DOUBLE)) AVERAGE_BATHROOMS,
      AVG(TRY_CAST(LONGITUDE AS DOUBLE)) LONGITUDE,
      AVG(TRY_CAST(LATITUDE AS DOUBLE)) LATITUDE,
      AVG(coalesce(CAST(PRICE AS DOUBLE), 0)) AS AVERAGE_PRICE
    FROM `bright_data_real_estate_listings`.`datasets`.`zillow_properties`
    GROUP BY ALL
    """

# Run the aggregation; the result stays a (lazy) Spark DataFrame.
real_estate_df = spark.sql(LISTINGS_DAILY_SQL)

# spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Basic cleaning step: discard every row that has a NULL in any column.
real_estate_df_clean = real_estate_df.na.drop(how="any")


In [0]:
import plotly.express as px

# Collect the cleaned Spark result into a pandas frame that Plotly can consume.
df_plot = real_estate_df_clean.toPandas()

# One marker per aggregated row; both marker colour and marker size encode AVERAGE_PRICE.
fig = px.scatter_mapbox(
    df_plot,
    lat="LATITUDE",
    lon="LONGITUDE",
    color="AVERAGE_PRICE",
    size="AVERAGE_PRICE",
    color_continuous_scale=px.colors.cyclical.IceFire,
    size_max=20,
    zoom=2.5,
    mapbox_style="carto-positron",
)

# Title, dark colour scheme and fixed canvas size, applied in a single layout update.
layout_options = dict(
    title="Real Estate Prices",
    title_x=0.5,               # centre the title horizontally
    title_font_size=24,
    title_font_color="white",
    paper_bgcolor="black",     # dark background behind the map
    legend_font_color="white",
    font_color="white",        # default colour for all remaining text
    autosize=False,            # use the explicit width/height below
    width=800,
    height=600,
)
fig.update_layout(**layout_options)

# Marker appearance and the trace's display name.
fig.update_traces(marker={"symbol": "circle", "opacity": 0.8}, name="Average Price")

fig.show()

In [0]:
# Separate the prediction target from the feature columns, then split the data
# into training (60%), validation (20%) and test (20%) sets.
from sklearn.model_selection import train_test_split

# NOTE(review): the aggregated query result has no `pricePerSquareFoot` column;
# AVERAGE_PRICE is the only price column it produces, so it is used as the
# target here — confirm this is the intended dependent variable.
TARGET_COLUMN = "AVERAGE_PRICE"

# scikit-learn splits in-memory array-likes, not Spark DataFrames, so collect
# the cleaned result into pandas first.
real_estate_pdf = real_estate_df_clean.toPandas()
if TARGET_COLUMN not in real_estate_pdf.columns:
    raise KeyError(
        "Target column {!r} not found; available columns: {}".format(
            TARGET_COLUMN, list(real_estate_pdf.columns)
        )
    )

y = real_estate_pdf[TARGET_COLUMN]
# Keep numeric features only: the date and string columns (city, state, zip
# code, home status) cannot be fed to a scikit-learn estimator without encoding.
X = real_estate_pdf.drop(columns=[TARGET_COLUMN]).select_dtypes(include="number")

# Split out the training data
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=123)

# Split the remaining data equally into validation and test
X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=123)

In [0]:
import time

import cloudpickle
import mlflow
import mlflow.pyfunc
import mlflow.sklearn
import numpy as np
import sklearn
from mlflow.models.signature import infer_signature
from mlflow.utils.environment import _mlflow_conda_env
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import roc_auc_score

# The target is a continuous price, so this is a regression problem: a random
# forest *regressor* is trained and evaluated with RMSE and R^2. (A classifier
# scored with ROC AUC cannot be fitted to a continuous target.)
# SklearnModelWrapper exposes the fitted estimator through MLflow's pyfunc interface.

class SklearnModelWrapper(mlflow.pyfunc.PythonModel):
  """MLflow pyfunc wrapper around a fitted scikit-learn estimator.

  For estimators that provide ``predict_proba`` the wrapper returns the
  second column of its output (the positive-class probability in a binary
  problem); for all other estimators, such as regressors, it returns the
  result of ``predict``.
  """

  def __init__(self, model):
    # The fitted scikit-learn estimator to delegate to.
    self.model = model

  def predict(self, context, model_input):
    """Return one prediction per row of ``model_input``.

    ``context`` is supplied by MLflow and is not used here.
    """
    if hasattr(self.model, "predict_proba"):
      # predict_proba returns one column per class; keep column 1.
      return self.model.predict_proba(model_input)[:, 1]
    return self.model.predict(model_input)

# mlflow.start_run creates a new MLflow run to track the performance of this model.
# Within the context, mlflow.log_param records the parameters used and
# mlflow.log_metric records the evaluation metrics.
with mlflow.start_run(run_name='untuned_random_forest'):
  n_estimators = 10
  model = RandomForestRegressor(n_estimators=n_estimators, random_state=np.random.RandomState(123))
  model.fit(X_train, y_train)

  # Evaluate on the held-out test set.
  predictions_test = model.predict(X_test)
  # Root-mean-squared error, in the same units as the target.
  rmse_test = float(np.sqrt(np.mean((np.asarray(y_test) - predictions_test) ** 2)))
  # For regressors, score() returns the coefficient of determination (R^2).
  r2_test = model.score(X_test, y_test)
  mlflow.log_param('n_estimators', n_estimators)
  mlflow.log_metric('rmse', rmse_test)
  mlflow.log_metric('r2', r2_test)
  wrappedModel = SklearnModelWrapper(model)
  # Log the model with a signature that defines the schema of the model's inputs and outputs.
  # When the model is deployed, this signature will be used to validate inputs.
  signature = infer_signature(X_train, wrappedModel.predict(None, X_train))

  # MLflow contains utilities to create a conda environment used to serve models.
  # The necessary dependencies are added to a conda.yaml file which is logged along with the model.
  conda_env =  _mlflow_conda_env(
        additional_conda_deps=None,
        additional_pip_deps=["cloudpickle=={}".format(cloudpickle.__version__), "scikit-learn=={}".format(sklearn.__version__)],
        additional_conda_channels=None,
    )
  mlflow.pyfunc.log_model("random_forest_model", python_model=wrappedModel, conda_env=conda_env, signature=signature)

In [0]:
# Pair each training feature name with the importance the fitted forest assigned to it.
feature_importances = pd.DataFrame(
    {"importance": model.feature_importances_},
    index=X_train.columns.tolist(),
)
# Display the features, most important first.
feature_importances.sort_values(by="importance", ascending=False)