# Testing the correlation and proximity of TVTC colleges to big cities

In [None]:
import numpy as np
import pandas as pd

# Load the raw inputs: the college records first, then the reference cities.
# `data_path` is reused, so it ends up pointing at the cities file.
data_path = '../data/raw/Geospatialdataoftechnicalcolleges2024.csv'
df_colleges = pd.read_csv(data_path)
data_path = '../data/raw/geoLoacations.csv'
df_cities = pd.read_csv(data_path)

# De-duplicate on the (college, region) pair so that a college listed on
# several rows is counted only once within its region.
df_unique_colleges = df_colleges.drop_duplicates(subset=['المنشأة التدريبية', 'المنطقة'])

# Count the (non-null) college names per region and expose the result as a
# two-column frame: region, number_of_colleges_in_region.
colleges_per_region = (
    df_unique_colleges
    .groupby('المنطقة')['المنشأة التدريبية']
    .count()
    .reset_index(name='number_of_colleges_in_region')
)

# Attach the regional count to every college row (left join keeps all rows).
df_colleges = df_colleges.merge(colleges_per_region, on='المنطقة', how='left')

# Sanity check: one line per distinct (region, count) pair.
print(df_colleges[['المنطقة', 'number_of_colleges_in_region']].drop_duplicates())


            المنطقة  number_of_colleges_in_region
0           الشرقية                             9
1            الرياض                            33
4   المدينة المنورة                            12
5              عسير                            23
6            القصيم                            14
7             الجوف                             7
10            جازان                             9
16             حائل                             6
19           الباحة                             5
25      مكة المكرمة                            17
30             تبوك                             8
86  الحدود الشمالية                             4
97            نجران                             6


In [None]:
import numpy as np

# Haversine formula to calculate distance between two lat-long points
def haversine(lat1, lon1, lat2, lon2, radius_km=6371):
    """Great-circle distance between two latitude/longitude points.

    Applies the haversine formula on a sphere. Only NumPy ufuncs are used, so
    the arguments may be scalars or NumPy arrays / pandas Series, which are
    combined element-wise.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in decimal degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in decimal degrees.
    radius_km : float, default 6371
        Sphere radius; the default is the Earth's mean radius in kilometres.

    Returns
    -------
    float or array-like
        Distance in the same unit as ``radius_km`` (kilometres by default).
    """
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)

    a = np.sin(delta_phi / 2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2)**2
    # Floating-point rounding can leave `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would turn sqrt(1 - a) into NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return radius_km * c

# Function to calculate distance to all major cities
def calculate_distances_to_cities(row, cities_df):
    """Distance from one location to every city listed in ``cities_df``.

    ``row`` must provide 'latitude' and 'longitude'; ``cities_df`` must have the
    columns 'المدينة', 'latitude' and 'longitude'. The result is a pandas Series
    with one 'distance_to_<city>' entry per city, in ``cities_df`` row order,
    holding whatever ``haversine`` returns for that pair of points.
    """
    city_columns = zip(
        cities_df['المدينة'],
        cities_df['latitude'],
        cities_df['longitude'],
    )
    return pd.Series({
        f'distance_to_{city_name}': haversine(row['latitude'], row['longitude'], city_lat, city_lon)
        for city_name, city_lat, city_lon in city_columns
    })

# Distance from every college to every reference city: one
# 'distance_to_<city>' column per row of df_cities.
city_distances = df_colleges.apply(
    lambda row: calculate_distances_to_cities(row, df_cities), axis=1
)

# DataFrame.join raises on overlapping column names, so re-executing this cell
# used to fail. Drop any same-named columns left by an earlier run first.
df_colleges = (
    df_colleges
    .drop(columns=city_distances.columns, errors='ignore')
    .join(city_distances)
)

# Distance to a fixed Riyadh reference point, computed for the whole column at
# once (haversine only uses NumPy ufuncs, so it accepts Series directly).
RIYADH_LAT, RIYADH_LON = 24.7136, 46.6753
df_colleges['distance_to_riyadh'] = haversine(
    df_colleges['latitude'], df_colleges['longitude'], RIYADH_LAT, RIYADH_LON
)

# Print the first few rows to verify
print(df_colleges.head())

# Persist the enriched table (UTF-16 encoded, without the index column).
df_colleges.to_csv('../data/edited/colleges_with_distances.csv', index=False, encoding='utf-16')



                       المنشأة التدريبية          المنطقة  المدينة   latitude  \
0         الكلية التقنية للبنين بالاحساء          الشرقية  الاحساء  25.478761   
1   الكلية التقنية للبنين بمحافظة الحريق           الرياض   الرياض  23.606421   
2   الكلية التقنية للبنين بمحافظة السليل           الرياض   السليل  20.454281   
3     الكلية التقنية للبنين بمحافظة ثادق           الرياض     ثادق  25.227729   
4     الكلية التقنية للبنين بمحافظة خيبر  المدينة المنورة     خيبر  25.621635   

   longitude  number_of_colleges_in_region  distance_to_الرياض  \
0  49.542337                             9          300.971678   
1  46.470750                            33          124.849441   
2  45.601435                            33          486.269230   
3  45.864288                            33           99.756342   
4  39.311318                            12          747.845330   

   distance_to_جدة  distance_to_مكة  distance_to_المدينة المنورة  \
0      1149.924501      1088.482063             

In [None]:

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# Spatial clustering of the colleges with DBSCAN on standardised coordinates.
coordinates = df_colleges[['latitude', 'longitude']].to_numpy()

# Standardise each coordinate column (zero mean, unit variance) before clustering.
scaler = StandardScaler()
scaler.fit(coordinates)
coordinates_scaled = scaler.transform(coordinates)

# NOTE(review): eps is measured in the standardised units produced above, not
# in kilometres or degrees - confirm 0.5 is the intended neighbourhood size.
db = DBSCAN(eps=0.5, min_samples=5)
db.fit(coordinates_scaled)

# One label per college; DBSCAN marks noise points with -1.
df_colleges['cluster'] = db.labels_



In [None]:
import folium
# Base map for the visualisation, centred on Riyadh (24.7136, 46.6753) at zoom level 6.
m = folium.Map(
    location=[24.7136, 46.6753],
    zoom_start=6,
)

# Major cities keyed by the short labels that get_nearest_city returns; each
# value holds the decimal-degree coordinates used when drawing on the map.
major_cities = {
    name: {'latitude': lat, 'longitude': lon}
    for name, lat, lon in [
        ('riyadh', 24.7136, 46.6753),
        ('jeddah', 21.2854, 39.2376),
        ('medina', 24.4709, 39.6102),
        ('damam', 26.4207, 49.9777),
        ('buraidah', 26.3378, 43.9747),
        ('abha', 18.2162, 42.5053),
        ('alhasa', 25.4788, 49.5423),
    ]
}
def get_nearest_city(row, verbose=False):
    """Return the key of the major city with the smallest distance for a college.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the seven 'distance_to_<Arabic city name>' entries read
        below. When ``verbose`` is True it must also provide
        'المنشأة التدريبية' and 'المدينة'.
    verbose : bool, default False
        Print the distances that were considered. This used to run
        unconditionally and produced one long line per college.

    Returns
    -------
    str
        One of 'riyadh', 'jeddah', 'medina', 'damam', 'buraidah', 'abha',
        'alhasa'. On ties the first of these in that order wins.

    Raises
    ------
    ValueError
        If every distance is missing (NaN/None).
    """
    distances = {
        'riyadh': row['distance_to_الرياض'],
        'jeddah': row['distance_to_جدة'],
        'medina': row['distance_to_المدينة المنورة'],
        'damam': row['distance_to_الدمام'],
        'buraidah': row['distance_to_بريدة'],
        'abha': row['distance_to_أبها'],
        'alhasa': row['distance_to_الأحساء'],
    }

    if verbose:
        # Debugging aid: show every distance that takes part in the comparison.
        print(f"Distances for {row['المنشأة التدريبية']} ({row['المدينة']}): {distances}")

    # NaN never compares as smaller or larger, so a missing value sitting in
    # first position would silently win min(). Rank the known distances only.
    known_distances = {city: dist for city, dist in distances.items() if pd.notna(dist)}
    if not known_distances:
        raise ValueError("no valid distance to any major city for this row")

    return min(known_distances, key=known_distances.get)

# Plot every college: a marker coloured by how many colleges its region has,
# plus a line joining the college to whichever major city is closest.
for index, row in df_colleges.iterrows():
    nearest_city = get_nearest_city(row)
    city_coords = major_cities[nearest_city]

    # Colour bands: fewer than 10 -> green, 10 to 19 -> orange, otherwise red.
    region_college_count = row['number_of_colleges_in_region']
    if region_college_count < 10:
        color = 'green'
    elif region_college_count < 20:
        color = 'orange'
    else:
        color = 'red'

    # Popup text shown when the college marker is clicked.
    popup_html = f"الكلية: {row['المنشأة التدريبية']}<br>المدينة: {row['المدينة']}<br>المنطقة: {row['المنطقة']}<br>عدد الكليات في المنطقة: {row['number_of_colleges_in_region']}<br>أقرب مدينة: {nearest_city}"

    college_marker = folium.Marker(
        location=[row['latitude'], row['longitude']],
        popup=popup_html,
        icon=folium.Icon(color=color),
    )
    college_marker.add_to(m)

    # Straight segment from the college to its nearest major city.
    segment = [
        [row['latitude'], row['longitude']],
        [city_coords['latitude'], city_coords['longitude']],
    ]
    connector = folium.PolyLine(locations=segment, color='blue', weight=2.5, opacity=1)
    connector.add_to(m)

# Mark the major cities themselves with blue info icons.
for city, coords in major_cities.items():
    city_marker = folium.Marker(
        location=[coords['latitude'], coords['longitude']],
        popup=f"مدينة: {city.capitalize()}",
        icon=folium.Icon(color='blue', icon='info-sign'),
    )
    city_marker.add_to(m)

# Write the finished map to a standalone HTML file.
m.save("colleges_nearest_lines.html")

# Leaving the map as the last expression renders it inline in the notebook.
m

Distances for  الكلية التقنية للبنين بالاحساء (الاحساء): {'riyadh': 300.97167818470837, 'jeddah': 1149.9245013591976, 'medina': 1006.9709915507398, 'damam': 113.42378455751593, 'buraidah': 564.9161896205077, 'abha': 1085.5539053494174, 'alhasa': 0.005709687556337424}
Distances for  الكلية التقنية للبنين بمحافظة الحريق (الرياض): {'riyadh': 124.84944074086646, 'jeddah': 786.7441418090502, 'medina': 702.9775731520147, 'damam': 471.970914076976, 'buraidah': 393.9481525369347, 'abha': 727.0858845716537, 'alhasa': 373.96938550804146}
Distances for  الكلية التقنية للبنين بمحافظة السليل (السليل): {'riyadh': 486.2692300835779, 'jeddah': 667.5765922043952, 'medina': 760.0847237828086, 'damam': 799.4903292292321, 'buraidah': 674.3663121924824, 'abha': 409.1712599850035, 'alhasa': 689.0309864231185}
Distances for  الكلية التقنية للبنين بمحافظة ثادق ( ثادق): {'riyadh': 99.75634200184203, 'jeddah': 806.2933709744178, 'medina': 636.3919980836471, 'damam': 432.5275575240674, 'buraidah': 225.6191027437

# Redundant, but needs fixing