In [None]:
# NOTE: This cell is intentionally disabled (every statement is commented out).
# It was a quick-load shortcut that reads final_score_no_geom_fixed.csv directly;
# that is the same path the data-cleaning pipeline cell below writes and then
# loads into `df`. Un-comment only to skip regenerating the cleaned file.
# NOTE(review): the output kept under this cell presumably comes from an earlier
# run with these lines active — confirm before relying on it.
# from google.colab import drive
# drive.mount('/content/drive')
# import pandas as pd
# import plotly.express as px


# clean_path = "/content/drive/MyDrive/DAINA 3/final_score_no_geom_fixed.csv"

# df = pd.read_csv(clean_path, sep=",", quotechar='"', engine="python")

# print(df.shape)
# print(df.columns)
# df.head(3)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
(321820, 17)
Index(['id', 'left', 'top', 'right', 'bottom', 'row_index', 'col_index',
       'area', 'perimeter', 'geometry', 'land_score', 'dem_score', 'dni_score',
       'temp_score', 'pvout_score', 'dso_score', 'pv_score'],
      dtype='object')


Unnamed: 0,id,left,top,right,bottom,row_index,col_index,area,perimeter,geometry,land_score,dem_score,dni_score,temp_score,pvout_score,dso_score,pv_score
0,676612,414729.589868,369375.174135,414979.589868,369125.174135,306,855,6656.084138,410.853786,GEOM_REMOVED,1.0,177.928571,1019.047974,9.4,1114.011963,0.034582,0.811841
1,676611,414729.589868,369625.174135,414979.589868,369375.174135,305,855,28107.499155,755.139192,GEOM_REMOVED,1.0,177.714286,1020.970893,9.4,1114.011963,0.033724,0.816066
2,676610,414729.589868,369875.174135,414979.589868,369625.174135,304,855,56502.279292,946.460361,GEOM_REMOVED,1.0,176.648936,1020.913394,9.4,1114.011963,0.03238,0.821032


In [None]:
# ==============================================================
# 🧭 DAINA Project | Data Cleaning Pipeline for MCDM Analysis
# --------------------------------------------------------------
# This cell:
#   1. Mounts Google Drive
#   2. Reads the raw exported CSV (with geometry)
#   3. Removes the MULTIPOLYGON geometry field safely
#   4. Cleans extra quotes and commas
#   5. Saves:
#        - final_score_no_geom.csv  (clean but full)
#        - mcdm_input.csv           (only id + numeric criteria)
# ==============================================================

import re

import pandas as pd
import plotly.express as px
from google.colab import drive

# --- 1. Mount Drive ---
drive.mount('/content/drive')

# --- 2. Define paths ---
base_path  = "/content/drive/MyDrive/DAINA 3"
raw_path   = f"{base_path}/final_score_UTF8.csv"
no_geom    = f"{base_path}/final_score_no_geom.csv"
mcdm_ready = f"{base_path}/mcdm_input.csv"

# --- 3. Remove geometry safely ---
# The doubled-quote MULTIPOLYGON field is swapped for a fixed placeholder so
# the commas inside it can no longer be mistaken for field separators.
# The pattern is compiled once instead of being re-parsed for every line.
geom_pattern = re.compile(r'""MULTIPOLYGON\s*\(\(.*?\)\)""')

# errors="ignore" silently drops undecodable bytes (kept from the original).
with open(raw_path, "r", encoding="utf-8", errors="ignore") as infile, \
        open(no_geom, "w", encoding="utf-8") as outfile:
    for line in infile:
        outfile.write(geom_pattern.sub('"GEOM_REMOVED"', line))

print(f"✅ Geometry removed and saved as: {no_geom}")

# --- 4. Remove extra outer quotes if they exist ---
clean_fixed = no_geom.replace(".csv", "_fixed.csv")

with open(no_geom, "r", encoding="utf-8", errors="ignore") as infile, \
        open(clean_fixed, "w", encoding="utf-8") as outfile:
    for line in infile:
        # A record that both starts and ends with a quote is treated as being
        # wrapped in one extra pair of quotes, which is peeled off here.
        # NOTE(review): a row whose first AND last fields are individually
        # quoted would also match — confirm this cannot occur in the export.
        if line.startswith('"') and line.rstrip().endswith('"'):
            line = line.strip()[1:-1]
        # strip() removed the newline on unwrapped rows; restore exactly one.
        outfile.write(line + ("\n" if not line.endswith("\n") else ""))

print(f"✅ Outer quotes cleaned and saved as: {clean_fixed}")

# --- 5. Load cleaned file into DataFrame ---
df = pd.read_csv(clean_fixed, sep=",", quotechar='"', engine="python")
print("✅ Loaded successfully:", df.shape)
print("Columns:", df.columns.tolist())

# --- 6. Save the MCDM input (id + criteria columns) ---
# Previously `cols_to_keep` was never defined and nothing was written, so the
# cell ended in a NameError while claiming the file had been saved.
# NOTE(review): assumes the criteria are exactly the "*_score" columns; confirm
# whether pv_score is an input criterion or a derived result that should be
# excluded.
cols_to_keep = ["id"] + [col for col in df.columns if col.endswith("_score")]
df[cols_to_keep].to_csv(mcdm_ready, index=False)

print(f"✅ MCDM-ready file saved as: {mcdm_ready}")
print("🎯 Columns retained:", cols_to_keep)

# Last expression of the cell so the preview is actually rendered.
df.head(3)


In [None]:
# Drop the positional columns (presumably each grid cell's bounding box and its
# row/column indices — confirm) so only id, geometry attributes and scores remain.
cols_to_drop = "left top right bottom row_index col_index".split()
df_reduced = df.drop(cols_to_drop, axis=1)

print("Remaining columns:", list(df_reduced.columns))
df_reduced.head(n=3)

Remaining columns: ['id', 'area', 'perimeter', 'geometry', 'land_score', 'dem_score', 'dni_score', 'temp_score', 'pvout_score', 'dso_score', 'pv_score']


Unnamed: 0,id,area,perimeter,geometry,land_score,dem_score,dni_score,temp_score,pvout_score,dso_score,pv_score
0,676612,6656.084138,410.853786,GEOM_REMOVED,1.0,177.928571,1019.047974,9.4,1114.011963,0.034582,0.811841
1,676611,28107.499155,755.139192,GEOM_REMOVED,1.0,177.714286,1020.970893,9.4,1114.011963,0.033724,0.816066
2,676610,56502.279292,946.460361,GEOM_REMOVED,1.0,176.648936,1020.913394,9.4,1114.011963,0.03238,0.821032


In [None]:
# 30-bin histogram of land_score over the reduced frame; update_layout returns
# the same figure, so styling is chained directly onto its construction.
fig_land = px.histogram(
    df_reduced,
    x="land_score",
    nbins=30,
    title="Distribution of Land Score",
    color_discrete_sequence=["orange"],
    opacity=0.8,
).update_layout(
    xaxis_title="Land Score",
    yaxis_title="Count",
    template="plotly_white",
)
fig_land.show()

# Summary statistics for the same column, rendered as the cell's output.
df_reduced.loc[:, "land_score"].describe()


Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Line plot of only those land_score values strictly greater than 1,
# plotted against the original row index.
px.line(df_reduced["land_score"].loc[lambda scores: scores > 1])


In [None]:
# Line plot of every dso_score value against the row index.
px.line(df_reduced.loc[:, "dso_score"])

Output hidden; open in https://colab.research.google.com to view.