In [3]:
#Step 1: Import Libraries

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans


In [7]:
#Step 2: Load Dataset

In [9]:
# Read the sample application records from the CSV that sits next to this
# notebook, then preview the first five rows as the cell output.
DATA_PATH = "fraud_applications_sample.csv"
df = pd.read_csv(DATA_PATH)
df.head(n=5)


Unnamed: 0,Application_ID,Applicant_Email_Count,Submission_Time_Minutes,Same_IP_Count,Experience_Years
0,1,2,120,3,0
1,2,3,60,3,5
2,3,1,60,5,5
3,4,3,120,2,0
4,5,3,60,1,0


In [11]:
#Step 3: Select Features for Fraud Detection

In [13]:
# Behavioral columns fed to the anomaly/clustering models. The identifier
# column (Application_ID) is intentionally left out of this selection.
feature_columns = [
    "Applicant_Email_Count",
    "Submission_Time_Minutes",
    "Same_IP_Count",
    "Experience_Years",
]
features = df[feature_columns]


In [None]:
#Step 4: Feature Scaling

In [15]:
# Standardize every feature column so that columns measured on larger
# numeric ranges (e.g. minutes) do not dominate the distance-based models.
# Fit first, then transform the same frame; `scaler` keeps the fitted stats.
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)


In [17]:
#Step 5: Isolation Forest (Anomaly Detection)

In [19]:
# Fit an Isolation Forest on the scaled features. `contamination=0.15` sets
# the expected share of anomalies; `random_state` fixes the tree sampling.
iso_model = IsolationForest(contamination=0.15, random_state=42)

# fit_predict returns -1 for anomalies and 1 for inliers; translate those
# codes into readable labels in a single step.
if_label_names = {1: "Normal", -1: "Suspicious"}
df["Fraud_IF"] = pd.Series(
    iso_model.fit_predict(scaled_features), index=df.index
).map(if_label_names)


In [21]:
#Step 6: K-Means Clustering

In [25]:
# Partition the applications into two clusters on the scaled features.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4 (and warns about it in 1.2-1.3), so leaving it unset
# makes the cluster assignment depend on the installed library version.
# NOTE: cluster ids (0/1) are arbitrary group labels, not fraud verdicts.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
df["Cluster"] = kmeans.fit_predict(scaled_features)




In [27]:
#Step 7: Generate Alerts

In [29]:
# Turn the Isolation Forest label into a reviewer-facing alert flag.
# The alert text is the police-light emoji (U+1F6A8) followed by "Alert";
# the literal previously held the UTF-8 bytes of that emoji mis-decoded as
# cp1252, which rendered as garbage characters in every output.
df["Alert"] = np.where(
    df["Fraud_IF"] == "Suspicious",
    "🚨 Alert",
    "OK"
)

# Show how many applications fall into each alert category.
df["Alert"].value_counts()


Alert
OK         25
🚨 Alert     5
Name: count, dtype: int64

In [31]:
#Step 8: Final Result View

In [33]:
# Preview the first five rows with the added Fraud_IF, Cluster and Alert columns.
df.head(n=5)


Unnamed: 0,Application_ID,Applicant_Email_Count,Submission_Time_Minutes,Same_IP_Count,Experience_Years,Fraud_IF,Cluster,Alert
0,1,2,120,3,0,Normal,0,OK
1,2,3,60,3,5,Normal,1,OK
2,3,1,60,5,5,Suspicious,0,🚨 Alert
3,4,3,120,2,0,Suspicious,1,🚨 Alert
4,5,3,60,1,0,Suspicious,1,🚨 Alert


This major project focuses on identifying fake or suspicious internship applications using unsupervised machine learning techniques.

Objective:
Detect anomalies in applications to prevent duplicate, automated, or inconsistent entries.

Approach:

Collected application-level data such as email duplication, submission time gaps, IP usage, and claimed experience.

Applied Isolation Forest to detect rare and abnormal patterns without labeled fraud data.

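Used K-Means Clustering (k = 2) to group applications into two behavioral segments as a complementary view of the data. The cluster ids are arbitrary group labels rather than normal/suspicious verdicts, and the alert rule below relies on the Isolation Forest result only.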

Implemented an alert mechanism to flag high-risk applications for manual review.

Tools & Technologies:
Python, Pandas, Scikit-learn, Jupyter Notebook

Outcome:
On the 30-application sample, the system flagged 5 applications as suspicious for manual review, which reduces manual screening effort and provides a scalable approach to improving internship application integrity.

Impact:
Helps administrators and mentors quickly identify potential fraud, save time, and ensure fair candidate selection.