### Installing Required Libraries

In [1]:
import pandas as pd

In [2]:
from  sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder

In [3]:
from datetime import datetime

### Loading the Dataset

In [4]:
dataset = pd.read_csv('SocialMediaUsersDataset.csv')
dataset = dataset.head(10000)

In [5]:
dataset

Unnamed: 0,UserID,Name,Gender,DOB,Interests,City,Country
0,1,Jesse Lawhorn,Female,1958-10-15,"'Movies', 'Fashion', 'Fashion', 'Books'",Sibolga,Indonesia
1,2,Stacy Payne,Female,2004-07-21,"'Gaming', 'Finance and investments', 'Outdoor ...",Al Abyār,Libya
2,3,Katrina Nicewander,Female,2000-02-07,"'DIY and crafts', 'Music', 'Science', 'Fashion'",Wādī as Sīr,Jordan
3,4,Eric Yarbrough,Male,1985-04-14,"'Outdoor activities', 'Cars and automobiles'",Matera,Italy
4,5,Daniel Adkins,Female,1955-09-18,"'Politics', 'History'",Biruaca,Venezuela
...,...,...,...,...,...,...,...
9995,9996,Mary Waddell,Female,1999-09-25,"'Pets', 'Music', 'Pets', 'Fitness'",Bālugaon,India
9996,9997,Philip Mcmanus,Female,1991-05-26,"'Fitness', 'Sports', 'Finance and investments'...",Michelstadt,Germany
9997,9998,Christina Frandsen,Female,1957-09-25,"'Travel', 'Outdoor activities', 'Gaming', 'Car...",Ropar,India
9998,9999,Carrie Weber,Male,1954-05-20,"'Social causes and activism', 'Pets', 'Books',...",North Decatur,United States


# Feature Extraction

### Interests Feature Extraction

In [6]:
interests = dataset['Interests'].str.get_dummies(', ')  # One-hot encode interests
interests.fillna(0, inplace=True)  # Replace NaN values with 0

In [7]:
interests

Unnamed: 0,'Art','Beauty','Books','Business and entrepreneurship','Cars and automobiles','Cooking','DIY and crafts','Education and learning','Fashion','Finance and investments',...,'Outdoor activities','Parenting and family','Pets','Photography','Politics','Science','Social causes and activism','Sports','Technology','Travel'
0,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
9996,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,1,1,0
9997,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
9998,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0


#### Calculating Age and adding it as a column

In [8]:
dataset['DOB'] = pd.to_datetime(dataset['DOB'])

# Get the current date
current_date = datetime.now()

# Function to calculate age
def calculate_age(dob):
    return current_date.year - dob.year - ((current_date.month, current_date.day) < (dob.month, dob.day))

# Apply the function to calculate the 'Age'
dataset['Age'] = dataset['DOB'].apply(calculate_age)

# Display the dataset with the new 'Age' column
dataset[['UserID', 'Name', 'DOB', 'Age']].head()

Unnamed: 0,UserID,Name,DOB,Age
0,1,Jesse Lawhorn,1958-10-15,66
1,2,Stacy Payne,2004-07-21,20
2,3,Katrina Nicewander,2000-02-07,24
3,4,Eric Yarbrough,1985-04-14,39
4,5,Daniel Adkins,1955-09-18,69


### Encoded Gender Feature Extraction

In [9]:
gender = dataset[['Gender']]
gender_encoded = pd.get_dummies(gender)

In [10]:
gender_encoded

Unnamed: 0,Gender_Female,Gender_Male
0,True,False
1,True,False
2,True,False
3,False,True
4,True,False
...,...,...
9995,True,False
9996,True,False
9997,True,False
9998,False,True


In [11]:
age = dataset[['Age']]

In [12]:
age

Unnamed: 0,Age
0,66
1,20
2,24
3,39
4,69
...,...
9995,25
9996,33
9997,67
9998,70


### Encoded Location Feature Extraction

In [13]:
location = dataset[['City', 'Country']]
location_encoded = pd.get_dummies(location)

In [14]:
location_encoded

Unnamed: 0,City_Aalen,City_Aarschot,City_Abadan,City_Abancay,City_Abano Terme,City_Abasolo,City_Abay,City_Abaza,City_Abbiategrasso,City_Abeokuta,...,Country_United States,Country_Uruguay,Country_Uzbekistan,Country_Vatican,Country_Venezuela,Country_Vietnam,Country_Western Sahara,Country_Yemen,Country_Zambia,Country_Zimbabwe
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9996,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9997,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9998,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False


# User similarity calculation

In [15]:
features = pd.concat([interests, gender_encoded, age, location_encoded], axis=1)

In [16]:
features

Unnamed: 0,'Art','Beauty','Books','Business and entrepreneurship','Cars and automobiles','Cooking','DIY and crafts','Education and learning','Fashion','Finance and investments',...,Country_United States,Country_Uruguay,Country_Uzbekistan,Country_Vatican,Country_Venezuela,Country_Vietnam,Country_Western Sahara,Country_Yemen,Country_Zambia,Country_Zimbabwe
0,0,0,1,0,0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,False,False
1,0,0,0,0,0,0,0,0,0,1,...,False,False,False,False,False,False,False,False,False,False
2,0,0,0,0,0,0,1,0,1,0,...,False,False,False,False,False,False,False,False,False,False
3,0,0,0,0,1,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
4,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
9996,0,0,0,0,0,1,0,0,0,1,...,False,False,False,False,False,False,False,False,False,False
9997,0,0,0,0,1,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
9998,0,0,1,0,0,0,0,0,0,0,...,True,False,False,False,False,False,False,False,False,False


### Calculating cosine similarity between users based on their extracted features

In [17]:
similarity_matrix = cosine_similarity(features)

In [19]:
similarity_matrix[0]

array([1.        , 0.99143166, 0.99454875, ..., 0.99875958, 0.99881512,
       0.99881609])

In [21]:
user_id = 1
similar_users_indices = similarity_matrix[user_id - 1].argsort()[::-1]  # Sort indices in descending order
similar_users = similar_users_indices[1:6]  # Get top 5 similar users

In [22]:
# Print the similar users
for similar_user_index in similar_users:
    print(f"Similar User ID: {similar_user_index + 1}")

Similar User ID: 9560
Similar User ID: 4497
Similar User ID: 6360
Similar User ID: 1119
Similar User ID: 654
