### Removing all 0 and below 0 values 

In [10]:
import rasterio
import numpy as np

def process_geotiff(input_file, output_file):
    # Open the input TIFF file
    with rasterio.open(input_file) as src:
        # Read the data from the first band
        data = src.read(1)
        
        # Set negative and zero values to NaN
        data[data <= 0] = np.nan
        
        # Update the metadata to handle NaN values correctly
        profile = src.profile
        profile.update(
            dtype=rasterio.float32,  # Ensure data type can handle NaN
            nodata=np.nan            # Define nodata value
        )
        
        # Write the modified data to a new TIFF file
        with rasterio.open(output_file, 'w', **profile) as dst:
            dst.write(data, 1)

# Define input and output file paths
input_file = 'UK_DEM_merged.tif'
output_file = 'UK_DEM2.tif'

# Process the TIFF file
process_geotiff(input_file, output_file)


### Finding the diffrence in both the areas

In [15]:
import rasterio
from rasterio.enums import Resampling
from rasterio.warp import reproject
import numpy as np

# Paths to the input GeoTIFF files
raster1_path = 'UK_DEM.tif'
raster2_path = 'UK_temp.tif'
output_path = 'UK_imputation_raster_mergedDEM.tif'

# Function to create a binary mask where data is present
def create_presence_mask(data, nodata_value):
    mask = np.ones_like(data, dtype=np.uint8)
    if nodata_value is not None and np.isnan(nodata_value):
        mask[np.isnan(data)] = 0
    elif nodata_value is not None:
        mask[data == nodata_value] = 0
    return mask

# Open the input rasters
with rasterio.open(raster1_path) as src1:
    data1 = src1.read(1)
    meta1 = src1.meta
    nodata1 = src1.nodata

with rasterio.open(raster2_path) as src2:
    data2 = src2.read(1)
    nodata2 = src2.nodata

    # Resample raster2 to match raster1 if dimensions differ
    if src1.shape != src2.shape:
       
        data2_resampled = np.empty(src1.shape, dtype=np.float32)
        reproject(
            source=data2,
            destination=data2_resampled,
            src_transform=src2.transform,
            src_crs=src2.crs,
            dst_transform=src1.transform,
            dst_crs=src1.crs,
            resampling=Resampling.bilinear
        )
    else:
        data2_resampled = data2

# Create presence masks for each raster
mask1 = create_presence_mask(data1, nodata1)
mask2 = create_presence_mask(data2_resampled, nodata2)

# Calculate the difference mask
difference_mask = mask1 & ~mask2

# Update metadata for the output raster
meta1.update(dtype=rasterio.uint8, nodata=0)

# Save the difference mask as a new GeoTIFF file
with rasterio.open(output_path, 'w', **meta1) as dst:
    dst.write(difference_mask, 1)

print("The difference raster has been created and saved.")

The difference raster has been created and saved.


### Resampling the Imputation raster to pixel size of temprature raster 

In [53]:
import rasterio
from rasterio.enums import Resampling
import numpy as np

# Paths to the input GeoTIFF files
raster1_path = 'UK_temp.tif'  # This is the reference raster for resolution
raster2_path = 'UK_imputation_raster.tif'
output_path = 'UK_imputation_resampled.tif'

# Function to create a binary mask where data is present
def create_presence_mask(data, nodata_value):
    mask = np.ones_like(data, dtype=np.uint8)
    if nodata_value is not None and np.isnan(nodata_value):
        mask[np.isnan(data)] = 0
    elif nodata_value is not None:
        mask[data == nodata_value] = 0
    return mask

# Open the reference raster to get its metadata and resolution
with rasterio.open(raster1_path) as src_ref:
    # Read the data from the first band
    data_ref = src_ref.read(1)
    profile_ref = src_ref.profile
    transform_ref = src_ref.transform  # Get the transform for resampling

# Open the raster to be resampled
with rasterio.open(raster2_path) as src:
    # Read the data from the first band
    data = src.read(1)
    profile = src.profile

    # Resample raster2 to match raster1 if dimensions differ
    if src_ref.shape != src.shape:
        resampled_data = np.empty(src_ref.shape, dtype=np.float32)
        rasterio.warp.reproject(
            source=rasterio.band(src, 1),
            destination=resampled_data,
            src_transform=src.transform,
            src_crs=src.crs,
            dst_transform=transform_ref,
            dst_crs=profile_ref['crs'],
            resampling=Resampling.bilinear
        )
    else:
        resampled_data = data  # No need to resample if already matching

# Create a presence mask for the resampled raster
mask_resampled = create_presence_mask(resampled_data, src.nodata)

# Update metadata for the output raster
profile_ref.update(
    dtype=rasterio.uint8,  # Update data type if necessary
    nodata=0  # Define nodata value
)

# Save the resampled mask as a new GeoTIFF file
with rasterio.open(output_path, 'w', **profile_ref) as dst:
    dst.write(mask_resampled, 1)

### New Section from here 

In [4]:
import rasterio
from shapely.geometry import Point
import geopandas as gpd
import numpy as np

# Path to your GeoTIFF file
input_file = 'UK_imputation_resampled2.tif'

# Lists to store point geometries, coordinates, and pixel values
points = []
lats = []
lons = []
values = []

# Open the GeoTIFF file
with rasterio.open(input_file) as src:
    # Get the number of rows and columns in the raster
    rows, cols = src.shape
    
    # Get the nodata value
    nodata_value = src.nodata

    # Read the entire raster data into memory
    data = src.read(1)

    # Iterate through each pixel in the raster
    for row in range(rows):
        for col in range(cols):
            # Read pixel value
            pixel_value = data[row, col]
            
            # Check if pixel value is not nodata and not NaN
            if pixel_value != nodata_value and not np.isnan(pixel_value):
                # Convert pixel coordinates to geographic coordinates (longitude, latitude)
                lon, lat = src.xy(row, col)

                # Create a point geometry for each valid pixel coordinate
                point = Point(lon, lat)
                points.append(point)
                lats.append(lat)
                lons.append(lon)
                values.append(pixel_value)

# Create a GeoDataFrame from the list of points
gdf = gpd.GeoDataFrame(geometry=points, crs=src.crs)

# Add latitude, longitude, and pixel values to the GeoDataFrame
gdf['latitude'] = lats
gdf['longitude'] = lons
gdf['value'] = values

# Save the GeoDataFrame as a shapefile
output_shapefile = 'UK_imputation2.shp'
gdf.to_file(output_shapefile)

print(f"Shapefile saved successfully: {output_shapefile}")

Shapefile saved successfully: UK_imputation2.shp


In [59]:
import geopandas as gpd

# Paths to your shapefiles
file1_path = 'UK_imputation.shp'
file2_path = 'UK_temp.shp'
output_path = 'UK_imputation.shp'

# Load the shapefiles
gdf1 = gpd.read_file(file1_path)
gdf2 = gpd.read_file(file2_path)

# Ensure both GeoDataFrames have the same CRS
if gdf1.crs != gdf2.crs:
    gdf2 = gdf2.to_crs(gdf1.crs)

# Identify points in gdf1 that are not in gdf2 based on latitude and longitude
gdf1_filtered = gdf1[~gdf1.apply(lambda x: (x['latitude'], x['longitude']) in 
                                set(gdf2.apply(lambda y: (y['latitude'], y['longitude']), axis=1)), axis=1)]

# Save the filtered GeoDataFrame to a new shapefile
gdf1_filtered.to_file(output_path)

print(f"Filtered shapefile saved successfully: {output_path}")


Filtered shapefile saved successfully: UK_imputation.shp


### Genrate another resample mask from diffrence and apply it 

In [None]:
import rasterio
from shapely.geometry import Point
import geopandas as gpd
import numpy as np

# Path to your GeoTIFF file
input_file = 'UK_imputation_resampled2.tif'

# Lists to store point geometries, coordinates, and pixel values
points = []
lats = []
lons = []
values = []

# Open the GeoTIFF file
with rasterio.open(input_file) as src:
    # Get the number of rows and columns in the raster
    rows, cols = src.shape
    
    # Get the nodata value
    nodata_value = src.nodata

    # Read the entire raster data into memory
    data = src.read(1)

    # Iterate through each pixel in the raster
    for row in range(rows):
        for col in range(cols):
            # Read pixel value
            pixel_value = data[row, col]
            
            # Check if pixel value is not nodata and not NaN
            if pixel_value != nodata_value and not np.isnan(pixel_value):
                # Convert pixel coordinates to geographic coordinates (longitude, latitude)
                lon, lat = src.xy(row, col)

                # Create a point geometry for each valid pixel coordinate
                point = Point(lon, lat)
                points.append(point)
                lats.append(lat)
                lons.append(lon)
                values.append(pixel_value)

# Create a GeoDataFrame from the list of points
gdf = gpd.GeoDataFrame(geometry=points, crs=src.crs)

# Add latitude, longitude, and pixel values to the GeoDataFrame
gdf['latitude'] = lats
gdf['longitude'] = lons
gdf['value'] = values

# Save the GeoDataFrame as a shapefile
output_shapefile = 'UK_imputation2.shp'
gdf.to_file(output_shapefile)

print(f"Shapefile saved successfully: {output_shapefile}")


In [6]:
import geopandas as gpd

# Paths to your shapefiles
file1_path = 'UK_imputation2.shp'
file2_path = 'UK_temp.shp'
file3_path = 'UK_imputation.shp'  # Path to the third shapefile
output_path = 'UK_imputation_points2.shp'  # Ensure this is a new output file name

# Load the shapefiles
gdf1 = gpd.read_file(file1_path)
gdf2 = gpd.read_file(file2_path)
gdf3 = gpd.read_file(file3_path)

# Ensure all GeoDataFrames have the same CRS
if gdf1.crs != gdf2.crs:
    gdf2 = gdf2.to_crs(gdf1.crs)
if gdf1.crs != gdf3.crs:
    gdf3 = gdf3.to_crs(gdf1.crs)

# Create sets of (latitude, longitude) tuples for gdf2 and gdf3
set_gdf2 = set(gdf2.apply(lambda y: (y['latitude'], y['longitude']), axis=1))
set_gdf3 = set(gdf3.apply(lambda y: (y['latitude'], y['longitude']), axis=1))

# Combine the sets
combined_set = set_gdf2.union(set_gdf3)

# Identify points in gdf1 that are not in either gdf2 or gdf3
gdf1_filtered = gdf1[~gdf1.apply(lambda x: (x['latitude'], x['longitude']) in combined_set, axis=1)]






Filtered shapefile saved successfully: UK_imputation_points2.shp


Combnine points from your and other resample mask 

In [10]:
import geopandas as gpd
import pandas as pd

final_output_path = 'final_points.shp'  # Path for the final merged shapefile

# Load the filtered shapefile and the third shapefile
filtered_points = gpd.read_file(output_path)
imputation_points = gpd.read_file(file3_path)

# Merge the two GeoDataFrames using pandas.concat
final_points = gpd.GeoDataFrame(pd.concat([filtered_points, imputation_points], ignore_index=True))

# Ensure the final GeoDataFrame has the same CRS
final_points = final_points.set_crs(gdf1.crs)

# Save the merged GeoDataFrame to a new shapefile
final_points.to_file(final_output_path)

print(f"Final merged shapefile saved successfully: {final_output_path}")

Final merged shapefile saved successfully: final_points.shp


In [7]:
import geopandas as gpd

# Load shapefiles
missing_points = gpd.read_file('UK_imputation.shp')
complete_data = gpd.read_file('UK_temp.shp')

from sklearn.neighbors import KNeighborsRegressor
from shapely.geometry import Point

# Convert GeoDataFrame to DataFrame for KNN processing
X_known = complete_data[['latitude', 'longitude']]
y_known = complete_data['value']
X_missing = missing_points[['latitude', 'longitude']]


# Initialize KNN regressor
knn = KNeighborsRegressor(n_neighbors=3, weights='distance')

# Fit KNN model
knn.fit(X_known, y_known)

# Predict missing temperatures
imputed_temperatures = knn.predict(X_missing)

# Add imputed temperatures to missing_points GeoDataFrame
missing_points['temperature'] = imputed_temperatures

# Save GeoDataFrame to shapefile
missing_points.to_file('UK_imputed_KNN.shp')

  missing_points.to_file('UK_imputed_KNN.shp')


### Idea 

1) So first we check how many 30m points are there around a 12km2 point
2) Next we have lat and long of each thing so find distances using that lat and long
3) Find some relation between elevation and temprature value 


Basic Idea need to do a lot of other changes

Need to Do 
1) Check resampling algos 
2) Check imputation algos 
3) Check about KNN (elbow rule)
3) explore more into using DEM while doing imputation 
4) the observed dataset I was checking mostly doesn't have points on the areas where we are conducting imputation