### Installing Required Libraries

In [1]:
import pandas as pd

In [2]:
from  sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder

In [3]:
from datetime import datetime

### Loading the Dataset

In [4]:
dataset = pd.read_csv('/kaggle/input/social-media-users/SocialMediaUsersDataset.csv')
dataset = dataset.head(5000)
dataset

Unnamed: 0,UserID,Name,Gender,DOB,Interests,City,Country
0,1,Jesse Lawhorn,Female,1958-10-15,"'Movies', 'Fashion', 'Fashion', 'Books'",Sibolga,Indonesia
1,2,Stacy Payne,Female,2004-07-21,"'Gaming', 'Finance and investments', 'Outdoor ...",Al Abyār,Libya
2,3,Katrina Nicewander,Female,2000-02-07,"'DIY and crafts', 'Music', 'Science', 'Fashion'",Wādī as Sīr,Jordan
3,4,Eric Yarbrough,Male,1985-04-14,"'Outdoor activities', 'Cars and automobiles'",Matera,Italy
4,5,Daniel Adkins,Female,1955-09-18,"'Politics', 'History'",Biruaca,Venezuela
...,...,...,...,...,...,...,...
4995,4996,Kelly Rodgers,Female,1966-03-23,'Cooking',Laval,France
4996,4997,Brian Morrison,Male,1988-06-05,'Health and wellness',Gyumri,Armenia
4997,4998,Micah Hoagland,Female,1970-05-11,"'Beauty', 'Fashion'",Chaska,United States
4998,4999,Greg Senz,Male,1979-04-13,'Travel',Emet,Turkey


# Feature Extraction

### Interests Feature Extraction

In [5]:
interests = dataset['Interests'].str.get_dummies(', ')  # One-hot encode interests
interests.fillna(0, inplace=True)  # Replace NaN values with 0
interests

Unnamed: 0,'Art','Beauty','Books','Business and entrepreneurship','Cars and automobiles','Cooking','DIY and crafts','Education and learning','Fashion','Finance and investments',...,'Outdoor activities','Parenting and family','Pets','Photography','Politics','Science','Social causes and activism','Sports','Technology','Travel'
0,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4997,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


#### Calculating Age and adding it as a column

In [6]:
dob = pd.to_datetime(dataset['DOB'])
current_date = datetime.now()
dataset['Age'] = (current_date - dob).astype('<m8[Y]')
dataset

Unnamed: 0,UserID,Name,Gender,DOB,Interests,City,Country,Age
0,1,Jesse Lawhorn,Female,1958-10-15,"'Movies', 'Fashion', 'Fashion', 'Books'",Sibolga,Indonesia,66.0
1,2,Stacy Payne,Female,2004-07-21,"'Gaming', 'Finance and investments', 'Outdoor ...",Al Abyār,Libya,20.0
2,3,Katrina Nicewander,Female,2000-02-07,"'DIY and crafts', 'Music', 'Science', 'Fashion'",Wādī as Sīr,Jordan,24.0
3,4,Eric Yarbrough,Male,1985-04-14,"'Outdoor activities', 'Cars and automobiles'",Matera,Italy,39.0
4,5,Daniel Adkins,Female,1955-09-18,"'Politics', 'History'",Biruaca,Venezuela,69.0
...,...,...,...,...,...,...,...,...
4995,4996,Kelly Rodgers,Female,1966-03-23,'Cooking',Laval,France,58.0
4996,4997,Brian Morrison,Male,1988-06-05,'Health and wellness',Gyumri,Armenia,36.0
4997,4998,Micah Hoagland,Female,1970-05-11,"'Beauty', 'Fashion'",Chaska,United States,54.0
4998,4999,Greg Senz,Male,1979-04-13,'Travel',Emet,Turkey,45.0


### Encoded Gender Feature Extraction

In [7]:
gender = dataset[['Gender']]
gender_encoded = pd.get_dummies(gender)
gender_encoded

Unnamed: 0,Gender_Female,Gender_Male
0,1,0
1,1,0
2,1,0
3,0,1
4,1,0
...,...,...
4995,1,0
4996,0,1
4997,1,0
4998,0,1


In [8]:
age = dataset[['Age']]
age

Unnamed: 0,Age
0,66.0
1,20.0
2,24.0
3,39.0
4,69.0
...,...
4995,58.0
4996,36.0
4997,54.0
4998,45.0


### Encoded Location Feature Extraction

In [9]:
location = dataset[['City', 'Country']]
location_encoded = pd.get_dummies(location)
location_encoded

Unnamed: 0,City_Abano Terme,City_Abay,City_Abbiategrasso,City_Abeokuta,City_Aberdeen,City_Abiko,City_Abinsk,City_Abohar,City_Abrama,City_Abuja,...,Country_United States,Country_Uruguay,Country_Uzbekistan,Country_Vatican,Country_Venezuela,Country_Vietnam,Country_Western Sahara,Country_Yemen,Country_Zambia,Country_Zimbabwe
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4997,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# User similarity calculation

In [10]:
features = pd.concat([interests, gender_encoded, age, location_encoded], axis=1)
features

Unnamed: 0,'Art','Beauty','Books','Business and entrepreneurship','Cars and automobiles','Cooking','DIY and crafts','Education and learning','Fashion','Finance and investments',...,Country_United States,Country_Uruguay,Country_Uzbekistan,Country_Vatican,Country_Venezuela,Country_Vietnam,Country_Western Sahara,Country_Yemen,Country_Zambia,Country_Zimbabwe
0,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4997,0,1,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
4998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Calculating cosine similarity between users based on their extracted features

In [11]:
similarity_matrix = cosine_similarity(features)

In [12]:
user_id = 5
similar_users_indices = similarity_matrix[user_id - 1].argsort()[::-1]  # Sort indices in descending order
similar_users = similar_users_indices[1:6]  # Get top 5 similar users (excluding the user itself)

In [13]:
# Print the similar users
for similar_user_index in similar_users:
    print(f"Similar User ID: {similar_user_index + 1}")

Similar User ID: 2251
Similar User ID: 53
Similar User ID: 371
Similar User ID: 127
Similar User ID: 179
