Author: Ben Soli


In [None]:
import json
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
model = SentenceTransformer('all-MiniLM-L6-v2')

Read in data

In [3]:

with open('./yoga_pose_data.json', 'r', encoding='utf8') as data:
    yoga_dict = json.load(data)



Splits corpus data into sentences and adds to lists for organizing data

In [5]:
names = []
descriptions = []
benefits = []
yoga_df = pd.DataFrame(columns=['names', 'description', 'benefits'])
for name, meta_data in yoga_dict.items():
    names.append(meta_data['Pose Name'])
    descriptions.extend(meta_data['Pose Description'].split('.'))
    benefits.extend(meta_data['Pose Benefits'].split('.'))




Creates lists of labels for text data

In [6]:
name_label = [2 for _ in names]
desc_label = [1 for _ in descriptions]
benefits_label = [0 for _ in benefits]


In [7]:
labels = LabelEncoder()
labels.fit(['names', 'description', 'benefits'])
print(labels.transform(['names', 'description', 'benefits']))


[2 1 0]


Creates a data frame containing data and labels

In [8]:
data_df = pd.DataFrame(columns=['text', 'label'])
data_df['text'] =  names + descriptions + benefits
data_df['label'] = name_label + desc_label + benefits_label

Shuffles the dataframe 

In [9]:
data_df = data_df.sample(frac=1)

Creates a column in the df containing the embedding for the text

In [35]:
data_df['embedding'] = [embedding for embedding in model.encode(data_df['text'].tolist())]



NameError: name 'ndarray' is not defined

In [38]:
print(data_df['text'].tolist())


['Strengthens the arms, the wrists, and the legs', '', '  Bend one knee and externally rotate it out to the side, upward and back', '', 'From Triangle (Preparation), flip your front palm and hinge up and back', '  Improves digestion', '', 'Cat', '  Rejuvenates the entire body', '  Strengthens thighs, knees, and ankles', '  Bend one knee in toward your chest and place your foot on the floor, with your heel as close to the same side sitting bone as possible', 'From Dolphin, root down into your forearms and come up high on your tiptoes', '  The arms are lifted up toward the sky with the elbows straight and the biceps by the ears', '', '  Relieves headache, insomnia, back pain, and fatigue', '  Keep the opposite leg engaged to assist on keeping the hips squared', '  The back foot is hooked on the inside of the elbow of the back arm', '  The pelvis is tucked', '', '  The head of the thighbone of the standing leg presses back toward the heel and is actively rooted into the earth', '  The hea

Creates train and test data for supervised learning

In [None]:
train, test = train_test_split(data_df, test_size=.2)

In [None]:
x_train = train['embedding'].tolist()
y_train = train['label'].tolist()

x_test = test['embedding'].tolist()
y_test = test['label'].tolist()



In [None]:
lr = LogisticRegression()
lr.fit(x_train, y_train)
preds = lr.predict(x_test)


In [None]:
print(classification_report(labels.inverse_transform(y_test), labels.inverse_transform(preds)))

In [None]:
rdf = RandomForestClassifier()
rdf.fit(x_train, y_train)
preds = rdf.predict(x_test)

In [None]:
print(classification_report(labels.inverse_transform(y_test), labels.inverse_transform(preds)))

In [None]:
test_query = model.encode('i need a pose to relieve neck pain')

In [None]:
lr_pred = lr.predict(test_query.reshape(1, -1))
rdf_pred = rdf.predict(test_query.reshape(1, -1))

In [None]:
print(labels.inverse_transform(lr_pred))
print(labels.inverse_transform(rdf_pred))

In [None]:
import numpy as np

Evalutes models according to data resembling user queries

In [None]:
test_queries = ['poses for tight hamstrings', 'poses that are done while sitting', 'help with stress', 'happy baby', 'sleeping snake', 'poses for better balance', 'goddess', 'core strength', 'silly emperor penguin']

test_embs = model.encode(test_queries)



print(labels.inverse_transform(lr.predict(test_embs)))
print(labels.inverse_transform(rdf.predict(test_embs)))
# print(labels.inverse_transform(rdf_preds))

In [None]:
# while the training of the model shows perfect or near perfect performance, evaluating with same queries shows the model is very biased towards classifying as 'benefits'
# more testing can be done but this may be a sign that it would be better to have the user more explicitly mark what type of search they are performing. 
# logistic regression serves a better classification head than rdf 
# 

In [None]:
def cos_sim(l1, l2):
    return np.dot(l1, l2) / (np.linalg.norm(l1) * np.linalg.norm(l2))

In [None]:
idx = np.argmax([cos_sim(test_query, emb) for emb in data_df['embedding'].tolist()])
lbl = data_df.iloc[idx]['label']
print(labels.inverse_transform([lbl]))

In [None]:
print(data_df.iloc[idx]['text'])

In [None]:
def cos_sim_classifier(data):
    classes = []
    for q in data:
        idx = np.argmax([cos_sim(q, emb) for emb in data_df['embedding'].tolist()])
        print(data_df.iloc[idx]['text'])
        lbl = data_df.iloc[idx]['label']
        classes.append(lbl)
    return classes

Evalutes the performance of cosine similarity based classifier using user queries 

In [None]:
test_queries = ['poses for tight hamstrings', 'poses that are done while sitting', 'help with stress', 'clapping seal ii', 'sleeping snake', 'poses for better balance', 'goddess', 'core strength', 'silly emperor penguin']

test_embs = model.encode(test_queries)
cos_preds = cos_sim_classifier(test_embs)

print(test_queries)
print(labels.inverse_transform(cos_preds))