# Initial Steps

## Install Dependencies

In [38]:
# !pip install --upgrade pip
# !pip install google-cloud-aiplatform
# !pip install scikit-learn
# !pip install mplcursors
!pip install ipympl

Defaulting to user installation because normal site-packages is not writeable
[0mCollecting ipympl
  Downloading ipympl-0.9.4-py3-none-any.whl.metadata (8.7 kB)
Downloading ipympl-0.9.4-py3-none-any.whl (516 kB)
[0mInstalling collected packages: ipympl
[0mSuccessfully installed ipympl-0.9.4


In [49]:
# %cd ..
# !ls
# NOTE(review): `!` runs the command in a separate shell process, so this
# activation presumably does not carry over to the running kernel — confirm
# that the kernel itself was started from `llm-venv`.
!source llm-venv/bin/activate

## Setup GCP and VertexAI

In [4]:
import json

# Load the project settings (e.g. the GCP project id and region read further
# down) from the shared configuration file.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default locale encoding.
with open("../resources/config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)
    

In [5]:
from google.auth.transport.requests import Request
from google.oauth2.service_account import Credentials

In [7]:
# Location of the JSON key file for your Google Cloud service account.
key_path = "../resources/keys.json"

In [8]:
# Build service-account credentials scoped for the Google Cloud Platform APIs.
credentials = Credentials.from_service_account_file(
    key_path,
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

# Credentials that have not fetched a token yet have no expiry, so `expired`
# reports False and would never trigger a refresh. `valid` is only True once a
# non-expired token is present, so check it to obtain a token up front.
if not credentials.valid:
    credentials.refresh(Request())

In [9]:
# Point the Vertex AI SDK at the configured project and region.
import vertexai

gcp_settings = config["gcp"]
PROJECT_ID = gcp_settings["project_id"]
REGION = gcp_settings["region"]

# Initialise the SDK with the service-account credentials created above.
vertexai.init(
    project=PROJECT_ID,
    location=REGION,
    credentials=credentials,
)

## Getting started with text embeddings

In [10]:
from vertexai.language_models import TextEmbeddingModel

In [11]:
# Fetch the pretrained text-embedding model by name.
embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko")

In [8]:
# Generate a word embedding.
# get_embeddings takes a list of texts; for a single input the result is a
# list holding one TextEmbedding, whose `.values` field is a Python list.
embedding = embedding_model.get_embeddings(["life"])

In [10]:
# Pull the raw embedding values (a Python list) out of the first result.
first_embedding = embedding[0]
vector = first_embedding.values

# Report the dimensionality and preview the first ten components.
print(f"Length of vector = {len(vector)}")
print(vector[0:10])

Length of vector = 768
[0.013755870051681995, -0.03912805765867233, -0.008546194061636925, -0.02595570497214794, 0.01972811669111252, -0.014042137190699577, -0.006056010257452726, -0.010783486068248749, -0.0036132442764937878, -0.00030527994385920465]


In [22]:
# Embed two short sentences so they can be compared afterwards.
sen1 = ["what computer system"]
sen2 = ["Explain life"]

# One request per sentence, issued in order: sen1 first, then sen2.
emb1, emb2 = (
    embedding_model.get_embeddings(sentence) for sentence in (sen1, sen2)
)

In [23]:
# Prepare the two sentence embeddings for comparison.
from sklearn.metrics.pairwise import cosine_similarity

# Wrap each embedding in a list so it is passed as a single-row matrix.
vec1, vec2 = [emb1[0].values], [emb2[0].values]

In [24]:
# Cosine similarity between the two sentence embeddings.
similarity = cosine_similarity(vec1, vec2)
print(similarity)

[[0.55529533]]


## Understand Embeddings

## Visualizing Embeddings

In [12]:
# Sample headlines, grouped into three loose topics.

# Animals
in_1 = "Missing flamingo discovered at swimming pool"
in_2 = "Sea otter spotted on surfboard by beach"
in_3 = "Baby panda enjoys boat ride"

# Food
in_4 = "Breakfast themed food truck beloved by all!"
in_5 = "New curry restaurant aims to please!"

# Programming
in_6 = "Python developers are wonderful people"
in_7 = "TypeScript, C++ or Java? All are great!"

# All headlines in their original order.
input_text_lst_news = [
    in_1,
    in_2,
    in_3,
    in_4,
    in_5,
    in_6,
    in_7,
]

In [13]:
import numpy as np

In [14]:
# Embed every headline with its own request and keep only the value lists.
embeddings = [
    embedding_model.get_embeddings([headline])[0].values
    for headline in input_text_lst_news
]

# Stack the per-headline lists into one NumPy array (one row per headline).
embedding_array = np.array(embeddings)

In [15]:
# Show the array's dimensions, followed by its contents.
print(f"Shape : {embedding_array.shape}")
print(embedding_array)

Shape : (7, 768)
[[ 0.00848222  0.00836626 -0.02967997 ...  0.00860982 -0.0410819
   0.00256639]
 [-0.0551771  -0.01340853 -0.03359561 ... -0.01553276  0.00358049
  -0.0037684 ]
 [-0.03348316  0.00433433 -0.02988683 ... -0.00524339  0.02533281
   0.00336217]
 ...
 [ 0.04009016 -0.05315267 -0.05048409 ...  0.02780198 -0.02067361
   0.01399049]
 [-0.01205887  0.00141472 -0.02448857 ...  0.02541188 -0.07642936
   0.06357682]
 [ 0.01457254 -0.02650255 -0.05243243 ...  0.00474664 -0.01937744
   0.00399636]]


In [16]:
# Project the high-dimensional embeddings down to two dimensions with PCA
# so they can be visualized in 2D.
from sklearn.decomposition import PCA

# fit() returns the fitted estimator, so construction and fitting are chained.
PCA_model = PCA(n_components=2).fit(embedding_array)
new_values = PCA_model.transform(embedding_array)

In [17]:
# Show the shape of the PCA-reduced data, followed by the reduced coordinates.
print(f"Shape: {new_values.shape}")
print(new_values)

Shape: (7, 2)
[[ 0.37876705 -0.04326331]
 [ 0.36152641 -0.25299494]
 [ 0.33130998 -0.08953431]
 [-0.16780305  0.39842445]
 [-0.02711076  0.483813  ]
 [-0.4234599  -0.25245445]
 [-0.45322972 -0.24399045]]
