In [None]:
!pip install tensorflow-text




In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
!gdown https://drive.google.com/uc?id=14po5DDA9kyQtKY0x5uhhO6vc4cCci6lE

Downloading...
From: https://drive.google.com/uc?id=14po5DDA9kyQtKY0x5uhhO6vc4cCci6lE
To: /content/gym_members_exercise_tracking.csv
  0% 0.00/65.1k [00:00<?, ?B/s]100% 65.1k/65.1k [00:00<00:00, 57.1MB/s]


In [None]:
df = pd.read_csv('gym_members_exercise_tracking.csv')
df

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI
0,56,Male,88.3,1.71,180,157,60,1.69,1313.0,Yoga,12.6,3.5,4,3,30.20
1,46,Female,74.9,1.53,179,151,66,1.30,883.0,HIIT,33.9,2.1,4,2,32.00
2,32,Female,68.1,1.66,167,122,54,1.11,677.0,Cardio,33.4,2.3,4,2,24.71
3,25,Male,53.2,1.70,190,164,56,0.59,532.0,Strength,28.8,2.1,3,1,18.41
4,38,Male,46.1,1.79,188,158,68,0.64,556.0,Strength,29.2,2.8,3,1,14.39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
968,24,Male,87.1,1.74,187,158,67,1.57,1364.0,Strength,10.0,3.5,4,3,28.77
969,25,Male,66.6,1.61,184,166,56,1.38,1260.0,Strength,25.0,3.0,2,1,25.69
970,59,Female,60.4,1.76,194,120,53,1.72,929.0,Cardio,18.8,2.7,5,3,19.50
971,32,Male,126.4,1.83,198,146,62,1.10,883.0,HIIT,28.2,2.1,3,2,37.74


In [None]:
# Data information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 973 entries, 0 to 972
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Age                            973 non-null    int64  
 1   Gender                         973 non-null    object 
 2   Weight (kg)                    973 non-null    float64
 3   Height (m)                     973 non-null    float64
 4   Max_BPM                        973 non-null    int64  
 5   Avg_BPM                        973 non-null    int64  
 6   Resting_BPM                    973 non-null    int64  
 7   Session_Duration (hours)       973 non-null    float64
 8   Calories_Burned                973 non-null    float64
 9   Workout_Type                   973 non-null    object 
 10  Fat_Percentage                 973 non-null    float64
 11  Water_Intake (liters)          973 non-null    float64
 12  Workout_Frequency (days/week)  973 non-null    int

In [None]:
# Output distribution
df['Workout_Type'].value_counts()

Unnamed: 0_level_0,count
Workout_Type,Unnamed: 1_level_1
Strength,258
Cardio,255
Yoga,239
HIIT,221


In [None]:
# Get all spams labels
spam_df = df[df['Workout_Type']=='Yoga']
spam_df.shape

(239, 15)

In [None]:
# Get all spams labels
spam_df = df[df['Workout_Type']=='HIIT']
spam_df.shape

(221, 15)

In [None]:
# Get random sample from ham data in size of spam data
ham_df_downsampled = ham_df.sample(spam_df.shape[0])
ham_df_downsampled.shape

(221, 15)

In [None]:
# Create balanced data that contains the same size of two categories
balanced_df = pd.concat([ham_df_downsampled, spam_df])
balanced_df.shape

(442, 15)

In [None]:
# New balanced data distribution
balanced_df['Workout_Type'].value_counts()

Unnamed: 0_level_0,count
Workout_Type,Unnamed: 1_level_1
HIIT,274
Strength,61
Cardio,54
Yoga,53


In [None]:
# Membuat kolom baru dengan nilai yang dienkode: 'Yoga' sebagai 1 dan lainnya sebagai 0
balanced_df['spam'] = balanced_df['Workout_Type'].apply(lambda x: 1 if x == 'Yoga' else 0)

# Menampilkan sampel data untuk memverifikasi hasil
balanced_df.sample(5)


Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI,spam
874,43,Female,40.5,1.74,187,143,51,1.5,965.0,HIIT,32.9,2.2,3,2,13.38,0
652,50,Male,64.5,1.79,163,130,61,1.17,753.0,Yoga,28.1,2.4,4,2,20.13,1
475,27,Male,89.3,1.64,192,162,64,1.82,1622.0,HIIT,12.1,3.5,4,3,33.2,0
636,26,Male,47.7,1.77,198,120,69,1.15,759.0,Strength,24.8,2.7,3,2,15.23,0
390,45,Male,58.4,1.72,194,150,65,1.31,973.0,HIIT,23.4,3.1,4,2,19.74,0


In [None]:
# Membagi data menjadi fitur dan target
X_train, X_test, y_train, y_test = train_test_split(
    balanced_df['Workout_Type'],  # Fitur, di sini menggunakan Workout_Type
    balanced_df['spam'],          # Target (kelas biner)
    stratify=balanced_df['spam']  # Stratifikasi untuk menjaga proporsi kelas
)


In [None]:
# Preprocessing model
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")

# BERT model
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [None]:
# This function apply bert preprocessing on sentences
def get_sentence_embeding(sentences):
    preprocessed_text = bert_preprocess(sentences)
    return bert_encoder(preprocessed_text)['pooled_output']

In [86]:
# Fungsi untuk menghasilkan embedding dari kalimat menggunakan BERT
def get_sentence_embedding(sentences):
    preprocessed_text = bert_preprocess(sentences)  # Preprocessing BERT pada kalimat
    return bert_encoder(preprocessed_text)['pooled_output']  # Mengembalikan embedding BERT

# Ambil contoh nilai dari kolom Workout_Type untuk digunakan sebagai input
sample_sentences = balanced_df['Workout_Type'].sample(6).tolist()  # Ambil 6 sampel acak

# Generate embedding menggunakan fungsi get_sentence_embedding
e = get_sentence_embedding(sample_sentences)

# Hitung kemiripan kosinus antara beberapa embedding
print("Cosine similarity between first and second:", cosine_similarity([e[0]], [e[1]]))
print("Cosine similarity between third and fourth:", cosine_similarity([e[2]], [e[3]]))
print("Cosine similarity between fifth and sixth:", cosine_similarity([e[4]], [e[5]]))


Cosine similarity between first and second: [[0.9838908]]
Cosine similarity between third and fourth: [[0.97140557]]
Cosine similarity between fifth and sixth: [[0.92914367]]
