Importing the Dependencies


In [1]:
import pandas as pd
from geopy.distance import geodesic
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib
import numpy as np


In [None]:
def is_outside_safe_zone(lat, lon, center_lat=27.3389, center_lon=88.6065, safe_radius_km=20):
    distance = geodesic((lat, lon), (center_lat, center_lon)).km
    return 1 if distance > safe_radius_km else 0

def is_in_dangerous_area(lat, lon, threshold_km=0.5):
    dangerous_areas = [
        {'latitude': 27.9475, 'longitude': 88.3315},
        {'latitude': 27.2200, 'longitude': 88.6020},
        {'latitude': 27.3450, 'longitude': 88.8790},
        {'latitude': 27.4205, 'longitude': 88.9314},
        {'latitude': 27.4120, 'longitude': 88.9570},
    ]
    current_loc = (lat, lon)
    for area in dangerous_areas:
        area_loc = (area['latitude'], area['longitude'])
        distance = geodesic(current_loc, area_loc).km
        if distance <= threshold_km:
            return 1
    return 0

Data Collection and Processing

In [None]:
# loading the csv data to a Pandas DataFrame
df = pd.read_csv('/content/synthetic_tourist_movements_sikkim.csv')

In [None]:
# print first 5 rows of the dataset
df.head()

Unnamed: 0,latitude,longitude,timestamp,speed,zone_id,SOS
0,27.409117,88.642231,2025-09-14 22:28:01,1.07,4,0
1,27.239048,88.475297,2025-09-14 22:33:01,1.12,1,0
2,27.221757,88.597712,2025-09-14 22:38:01,2.65,4,0
3,27.298366,88.707427,2025-09-14 22:43:01,4.34,1,0
4,27.474402,88.649485,2025-09-14 22:48:01,3.57,5,0


In [None]:
# print last 5 rows of the dataset
df.tail()

Unnamed: 0,latitude,longitude,timestamp,speed,zone_id,SOS
495,27.190005,88.643209,2025-09-16 15:43:01,5.54,5,0
496,27.236856,88.640526,2025-09-16 15:48:01,1.14,1,0
497,27.344989,88.665947,2025-09-16 15:53:01,3.8,2,0
498,27.335828,88.635859,2025-09-16 15:58:01,3.23,1,0
499,27.313984,88.63398,2025-09-16 16:03:01,1.18,5,0


In [None]:
# number of rows and columns in the dataset
df.shape

(500, 6)

In [None]:
# getting some info about the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   latitude   500 non-null    float64
 1   longitude  500 non-null    float64
 2   timestamp  500 non-null    object 
 3   speed      500 non-null    float64
 4   zone_id    500 non-null    int64  
 5   SOS        500 non-null    int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 23.6+ KB


In [None]:
# checking for missing values
df.isnull().sum()

Unnamed: 0,0
latitude,0
longitude,0
timestamp,0
speed,0
zone_id,0
SOS,0


In [None]:
# statistical measures about the data
df.describe()

Unnamed: 0,latitude,longitude,speed,zone_id,SOS
count,500.0,500.0,500.0,500.0,500.0
mean,27.33776,88.618781,2.94392,2.962,0.014
std,0.073833,0.082976,1.667135,1.394434,0.117608
min,27.152785,88.432031,0.14,1.0,0.0
25%,27.293444,88.568792,1.49,2.0,0.0
50%,27.335564,88.62015,2.815,3.0,0.0
75%,27.385174,88.66853,4.43,4.0,0.0
max,27.508828,88.805941,6.0,5.0,1.0


In [None]:
df['SOS'].value_counts()

Unnamed: 0_level_0,count
SOS,Unnamed: 1_level_1
0,493
1,7


In [None]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values(by=['timestamp']).reset_index(drop=True)
df['time_spent_in_zone'] = df['timestamp'].diff().fillna(pd.Timedelta(seconds=0)).dt.total_seconds()
center_lat, center_lon = 27.3389, 88.6065
df['distance_to_zone_center'] = df.apply(lambda row: geodesic(
    (row['latitude'], row['longitude']),
    (center_lat, center_lon)
).meters, axis=1)

df['near_dangerous_area'] = df.apply(lambda row: is_in_dangerous_area(row['latitude'], row['longitude']), axis=1)
df['outside_safe_zone'] = df.apply(lambda row: is_outside_safe_zone(row['latitude'], row['longitude']), axis=1)

features = ['speed', 'time_spent_in_zone', 'distance_to_zone_center', 'near_dangerous_area', 'outside_safe_zone']
X = df[features].fillna(0)
y_true = df['SOS']

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

Splitting the Data into Training data & Test Data

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_true, test_size=0.3, random_state=42)


In [None]:
print(X_scaled.shape, X_train.shape, X_test.shape)

(500, 5) (350, 5) (150, 5)


Model Training

Isolation Forest

In [None]:
model = IsolationForest(contamination=0.02, random_state=42)
model.fit(X_train)

Model Evaluation

In [None]:
y_train_pred = np.where(model.predict(X_train) == -1, 1, 0)
print("Training Accuracy:", accuracy_score(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

Training Accuracy: 0.9657142857142857
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       345
           1       0.00      0.00      0.00         5

    accuracy                           0.97       350
   macro avg       0.49      0.49      0.49       350
weighted avg       0.97      0.97      0.97       350



In [None]:
y_test_pred = np.where(model.predict(X_test) == -1, 1, 0)
print("Testing Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

Testing Accuracy: 0.9333333333333333
              precision    recall  f1-score   support

           0       0.99      0.95      0.97       148
           1       0.00      0.00      0.00         2

    accuracy                           0.93       150
   macro avg       0.49      0.47      0.48       150
weighted avg       0.97      0.93      0.95       150



In [None]:
# Save model and scaler
joblib.dump(model, 'anomaly_detection_model.pkl')
joblib.dump(scaler, 'feature_scaler.pkl')

from google.colab import files
files.download('anomaly_detection_model.pkl')
files.download('feature_scaler.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>