# load data

In [52]:
import pandas as pd
from google.cloud import bigquery
import plotly.express as px

# 1. Configuration (Uses the full View ID you provided)
PROJECT_ID = 'lap-coffee-476107'
DATASET_ID = 'lap_locations_final_merged'
VIEW_ID = 'lap_data_imputed'
FULL_VIEW_PATH = f"{PROJECT_ID}.{DATASET_ID}.{VIEW_ID}"

# 2. Initialize BigQuery client
# This assumes your environment is authenticated (e.g., via gcloud)
client = bigquery.Client(project=PROJECT_ID)

# 3. SQL Query to load all data from the imputed view
query = f"""
    SELECT *
    FROM `{FULL_VIEW_PATH}`
"""

print(f"Loading data from BigQuery view: {FULL_VIEW_PATH}")

# 4. Load data into a Pandas DataFrame
df = client.query(query).to_dataframe()

print("\n--- Data Loaded Successfully ---")
print(f"DataFrame shape: {df.shape}")
print("First 5 rows of imputed data:")
print(df.head())

Loading data from BigQuery view: lap-coffee-476107.lap_locations_final_merged.lap_data_imputed



BigQuery Storage module not found, fetch data with the REST endpoint instead.




--- Data Loaded Successfully ---
DataFrame shape: (4752, 20)
First 5 rows of imputed data:
         date        name        lat       lon  \
0  2025-01-01  LAP COFFEE  52.486768  13.35549   
1  2025-01-02  LAP COFFEE  52.486768  13.35549   
2  2025-01-03  LAP COFFEE  52.486768  13.35549   
3  2025-01-04  LAP COFFEE  52.486768  13.35549   
4  2025-01-05  LAP COFFEE  52.486768  13.35549   

                                   address  pm25_aod_proxy  \
0  Akazienstraße 3A, 10823 Berlin, Germany           0.060   
1  Akazienstraße 3A, 10823 Berlin, Germany           0.060   
2  Akazienstraße 3A, 10823 Berlin, Germany           0.060   
3  Akazienstraße 3A, 10823 Berlin, Germany           0.066   
4  Akazienstraße 3A, 10823 Berlin, Germany           0.061   

                      geometry weather_date  temp_max  temp_min  precip_mm  \
0  POINT (13.35549 52.4867684)   2025-01-01       6.9       0.5        0.0   
1  POINT (13.35549 52.4867684)   2025-01-02       6.8       1.2        4.0   


# data str

In [53]:
df.describe()

Unnamed: 0,lat,lon,pm25_aod_proxy,temp_max,temp_min,precip_mm,cafe_rating,cafe_user_ratings_total,ndvi,nightlight,elevation_m,parks_count_1km,open_bars_count_500m
count,4752.0,4752.0,4752.0,4752.0,4752.0,4752.0,4752.0,4752.0,4752.0,4752.0,4752.0,4752.0,4752.0
mean,52.515041,13.399718,0.157979,16.395602,7.96713,1.707534,4.54375,129.1875,0.16791,38.150437,47.75,6.375,11.8125
std,0.021106,0.041308,0.07159,8.175408,6.72926,3.94133,0.264529,152.018553,0.087498,9.658211,7.395878,2.99771,4.41944
min,52.479598,13.320827,0.0295,-2.6,-13.4,0.0,4.0,18.0,-0.043728,22.35,38.0,1.0,6.0
25%,52.50042,13.391518,0.107214,11.1,2.6,0.0,4.375,47.0,0.119552,38.150437,41.0,4.75,8.0
50%,52.50821,13.412825,0.147385,17.3,8.9,0.1,4.7,97.0,0.16791,38.150437,44.5,7.0,10.0
75%,52.534887,13.420839,0.1942,22.2,13.725,1.7,4.725,139.0,0.194789,38.150437,54.0,7.0,14.0
max,52.54932,13.459298,0.404,37.6,20.6,44.3,4.9,689.0,0.573524,101.589996,62.0,13.0,22.0


# feature space

In [54]:
import pandas as pd
import plotly.express as px

# 1️⃣ Update df: add first word of address to name
df['name_updated'] = df['name'] + "_" + df['address'].str.split().str[0]

# 2️⃣ Ensure season is categorical
df['season'] = pd.Categorical(
    df['season'],
    categories=['Winter', 'Spring', 'Summer', 'Autumn'],
    ordered=True
)

# 3️⃣ Drop unnecessary columns if they exist
cols_to_drop = ['first_word', 'month', 'weather_date', 'cafe_place_id']
df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])

# 4️⃣ Define features for plotting
features = {
    'ndvi': 'NDVI',
    'temp_max': 'Temperature (°C)',
    'pm25_aod_proxy': 'PM2.5 (AOD Proxy)',
    'nightlight': 'Nightlight Intensity'
}

# 5️⃣ Generate separate plots for each feature
for var, label in features.items():
    fig = px.scatter(
        df,
        x=var,
        y='name_updated',
        color='season',
        hover_data=['date', 'temp_max', 'pm25_aod_proxy', 'nightlight'],
        title=f'{label} by Season and Location',
    )

    fig.update_layout(
        height=600,
        legend_title_text='Season',
        title_font_size=18,
        margin=dict(t=60, l=80, r=40, b=40),
        xaxis_title=label,
        yaxis_title='Location (Name)',
    )

    fig.show()


# bars, parks

In [55]:
import plotly.express as px

# Scatter map for parks within 1 km
fig_parks = px.scatter_mapbox(
    df,
    lat="lat",
    lon="lon",
    color="parks_count_1km",
    size="parks_count_1km",
    hover_name="name_updated",
    hover_data={"lat": True, "lon": True, "season": True},
    color_continuous_scale="Greens",
    size_max=20,
    zoom=11,
    title="Nearby Parks within 500m"
)

fig_parks.update_layout(
    mapbox_style="carto-positron",
    margin=dict(t=60, l=10, r=10, b=10)
)
fig_parks.show()

# Scatter map for open bars within 500 m
fig_bars = px.scatter_mapbox(
    df,
    lat="lat",
    lon="lon",
    color="open_bars_count_500m",
    size="open_bars_count_500m",
    hover_name="name_updated",
    hover_data={"lat": True, "lon": True, "season": True},
    color_continuous_scale="Reds",
    size_max=20,
    zoom=11,
    title="Open Bars within 500 m"
)

fig_bars.update_layout(
    mapbox_style="carto-positron",
    margin=dict(t=60, l=10, r=10, b=10)
)
fig_bars.show()



*scatter_mapbox* is deprecated! Use *scatter_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/




*scatter_mapbox* is deprecated! Use *scatter_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/



# rfc

In [56]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# ----------------------------
# 1️⃣ Prepare features and target
# ----------------------------
response_var = "name_updated"
exclude_cols = ["date", "name", "lon", "lat", "address", "geometry", 'season', response_var]
X = df.drop(columns=exclude_cols)
y = df[response_var]

# Factorize object columns
for col in X.select_dtypes(include=['object']).columns:
    X[col], _ = pd.factorize(X[col])

# Encode target variable
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# ----------------------------
# 2️⃣ Train Random Forest
# ----------------------------
rfc = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)
rfc.fit(X, y_encoded)

# ----------------------------
# 3️⃣ Feature Engineering Function
# ----------------------------
def feature_engineering(df_input):
    df_feat = df_input.copy()
    df_feat['temp_range'] = df_feat['temp_max'] - df_feat['temp_min']
    df_feat['temp_avg'] = (df_feat['temp_max'] + df_feat['temp_min']) / 2
    df_feat['rainy_day'] = (df_feat['precip_mm'] > 1).astype(float)
    df_feat['parks_bars_ratio'] = df_feat['parks_count_1km'] / (df_feat['open_bars_count_500m'] + 1e-5)
    df_feat['pm25_ndvi_ratio'] = df_feat['pm25_aod_proxy'] / (df_feat['ndvi'] + 1e-5)
    df_feat['pm25_nightlight'] = df_feat['pm25_aod_proxy'] * df_feat['nightlight']
    return df_feat

# ----------------------------
# 4️⃣ Mood-based Dataset Creation
# ----------------------------
def create_mood_df(df_input, mood='green'):
    df_mood = feature_engineering(df_input)
    
    if mood == 'green':
        df_mood = df_mood.sort_values(['ndvi', 'pm25_aod_proxy', 'parks_count_1km'], ascending=[False, True, False])
    elif mood == 'cozy':
        df_mood = df_mood.sort_values(['temp_avg', 'nightlight', 'open_bars_count_500m'], ascending=[True, True, True])
    elif mood == 'urban':
        df_mood = df_mood.sort_values(['nightlight', 'open_bars_count_500m', 'pm25_aod_proxy'], ascending=[False, False, False])
    else:
        raise ValueError("Mood must be 'green', 'cozy', or 'urban'")
    
    # Keep only columns used in training, in correct order
    df_mood = df_mood[X.columns]
    return df_mood

# ----------------------------
# 5️⃣ Predict cafés based on mood
# ----------------------------
def predict_top_cafe(df_input, mood_name):
    mood_df = create_mood_df(df_input, mood=mood_name)
    pred_idx = rfc.predict(mood_df)
    pred_cafes = le.inverse_transform(pred_idx)
    return pred_cafes[0]

# ----------------------------
# 6️⃣ Run predictions
# ----------------------------
top_green = predict_top_cafe(df, 'green')
top_cozy = predict_top_cafe(df, 'cozy')
top_urban = predict_top_cafe(df, 'urban')

print("Top green café:", top_green)
print("Top cozy café:", top_cozy)
print("Top urban café:", top_urban)


Top green café: LAP COFFEE_Falckensteinstraße
Top cozy café: LAP COFFEE_Karl-Marx-Straße
Top urban café: LAP COFFEE_Kurfürstendamm


In [57]:
df

Unnamed: 0,date,name,lat,lon,address,pm25_aod_proxy,geometry,temp_max,temp_min,precip_mm,cafe_rating,cafe_user_ratings_total,season,ndvi,nightlight,elevation_m,parks_count_1km,open_bars_count_500m,name_updated
0,2025-01-01,LAP COFFEE,52.486768,13.355490,"Akazienstraße 3A, 10823 Berlin, Germany",0.060000,POINT (13.35549 52.4867684),6.9,0.5,0.0,4.7,151.0,Winter,0.16791,23.290001,45.0,11.0,12.0,LAP COFFEE_Akazienstraße
1,2025-01-02,LAP COFFEE,52.486768,13.355490,"Akazienstraße 3A, 10823 Berlin, Germany",0.060000,POINT (13.35549 52.4867684),6.8,1.2,4.0,4.7,151.0,Winter,0.16791,23.290001,45.0,11.0,12.0,LAP COFFEE_Akazienstraße
2,2025-01-03,LAP COFFEE,52.486768,13.355490,"Akazienstraße 3A, 10823 Berlin, Germany",0.060000,POINT (13.35549 52.4867684),2.6,-1.2,4.6,4.7,151.0,Winter,0.16791,23.290001,45.0,11.0,12.0,LAP COFFEE_Akazienstraße
3,2025-01-04,LAP COFFEE,52.486768,13.355490,"Akazienstraße 3A, 10823 Berlin, Germany",0.066000,POINT (13.35549 52.4867684),0.8,-2.3,0.0,4.7,151.0,Winter,0.16791,23.290001,45.0,11.0,12.0,LAP COFFEE_Akazienstraße
4,2025-01-05,LAP COFFEE,52.486768,13.355490,"Akazienstraße 3A, 10823 Berlin, Germany",0.061000,POINT (13.35549 52.4867684),0.4,-2.4,8.8,4.7,151.0,Winter,0.16791,23.290001,45.0,11.0,12.0,LAP COFFEE_Akazienstraße
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4747,2025-10-20,LAP COFFEE,52.501178,13.418971,"Adalbertstraße 91, 10999 Berlin, Germany",0.097444,POINT (13.4189707 52.5011783),13.0,3.4,0.0,4.4,41.0,Autumn,0.16791,38.150437,41.0,13.0,14.0,LAP COFFEE_Adalbertstraße
4748,2025-10-21,LAP COFFEE,52.501178,13.418971,"Adalbertstraße 91, 10999 Berlin, Germany",0.061625,POINT (13.4189707 52.5011783),15.9,8.7,1.1,4.4,41.0,Autumn,0.16791,38.150437,41.0,13.0,14.0,LAP COFFEE_Adalbertstraße
4749,2025-10-22,LAP COFFEE,52.501178,13.418971,"Adalbertstraße 91, 10999 Berlin, Germany",0.101250,POINT (13.4189707 52.5011783),16.2,9.8,1.0,4.4,41.0,Autumn,0.16791,38.150437,41.0,13.0,14.0,LAP COFFEE_Adalbertstraße
4750,2025-10-23,LAP COFFEE,52.501178,13.418971,"Adalbertstraße 91, 10999 Berlin, Germany",0.085500,POINT (13.4189707 52.5011783),16.1,10.0,3.5,4.4,41.0,Autumn,0.16791,38.150437,41.0,13.0,14.0,LAP COFFEE_Adalbertstraße
