# Initial Steps

## Install Dependencies

In [23]:
# !pip install --upgrade pip
# !pip install google-cloud-aiplatform
# !pip install scikit-learn
# !pip install mplcursors
# !pip install ipympl
!pip install pyarrow


Defaulting to user installation because normal site-packages is not writeable
[0mCollecting pyarrow
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[0mInstalling collected packages: pyarrow
[0mSuccessfully installed pyarrow-17.0.0


In [49]:
# %cd ..
# !ls
!source llm-venv/bin/activate

## Setup GCP and VertexAI

In [1]:
import json
with open("../resources/config.json") as f:
    config = json.load(f)
    

In [2]:
from google.auth.transport.requests import Request
from google.oauth2.service_account import Credentials

In [3]:
# Path to your service account key file
key_path = "../resources/keys.json" #Path to the json key associated with your service account from google cloud

In [4]:
# Create credentials object

credentials = Credentials.from_service_account_file(
    key_path,
    scopes=['https://www.googleapis.com/auth/cloud-platform'])

if credentials.expired:
    credentials.refresh(Request())

In [5]:
# Connect to vertexAI
import vertexai

PROJECT_ID = config['gcp']['project_id']
REGION = config['gcp']['region']

# initialize vertex
vertexai.init(project = PROJECT_ID, location = REGION, credentials = credentials)

## Get start with text embeddings

In [10]:
from vertexai.language_models import TextEmbeddingModel

In [11]:
# load the model

embedding_model = TextEmbeddingModel.from_pretrained(
    "textembedding-gecko")

In [8]:
# Generate word embeddigs
# The returned object is a list with a single TextEmbedding object.
# The TextEmbedding.values field stores the embeddings in a Python list.

embedding = embedding_model.get_embeddings(
    ["life"]
)

In [10]:
# The TextEmbedding.values field stores the embeddings in a Python list.
vector = embedding[0].values

print(f"Length of vector = {len(vector)}")
print(vector[:10])

Length of vector = 768
[0.013755870051681995, -0.03912805765867233, -0.008546194061636925, -0.02595570497214794, 0.01972811669111252, -0.014042137190699577, -0.006056010257452726, -0.010783486068248749, -0.0036132442764937878, -0.00030527994385920465]


In [22]:
# Generate sentence embeddings
sen1 = ["what computer system"]
sen2 = ["Explain life"]

emb1 = embedding_model.get_embeddings(sen1)
emb2 = embedding_model.get_embeddings(sen2)

In [23]:
# compare two embeddings
from sklearn.metrics.pairwise import cosine_similarity

# first get the vectors
vec1 = [emb1[0].values]
vec2 = [emb2[0].values]

In [24]:
print(cosine_similarity(vec1,vec2))

[[0.55529533]]


## Understand Embeddings

## Visualizing Embeddings

In [12]:
in_1 = "Missing flamingo discovered at swimming pool"

in_2 = "Sea otter spotted on surfboard by beach"

in_3 = "Baby panda enjoys boat ride"


in_4 = "Breakfast themed food truck beloved by all!"

in_5 = "New curry restaurant aims to please!"


in_6 = "Python developers are wonderful people"

in_7 = "TypeScript, C++ or Java? All are great!" 


input_text_lst_news = [in_1, in_2, in_3, in_4, in_5, in_6, in_7]

In [13]:
import numpy as np

In [14]:
embeddings = []
for inp_text in input_text_lst_news:
    emb = embedding_model.get_embeddings(
        [inp_text])[0].values
    embeddings.append(emb)

embedding_array = np.array(embeddings)

In [15]:
print("Shape : " + str(embedding_array.shape))
print(embedding_array)

Shape : (7, 768)
[[ 0.00848222  0.00836626 -0.02967997 ...  0.00860982 -0.0410819
   0.00256639]
 [-0.0551771  -0.01340853 -0.03359561 ... -0.01553276  0.00358049
  -0.0037684 ]
 [-0.03348316  0.00433433 -0.02988683 ... -0.00524339  0.02533281
   0.00336217]
 ...
 [ 0.04009016 -0.05315267 -0.05048409 ...  0.02780198 -0.02067361
   0.01399049]
 [-0.01205887  0.00141472 -0.02448857 ...  0.02541188 -0.07642936
   0.06357682]
 [ 0.01457254 -0.02650255 -0.05243243 ...  0.00474664 -0.01937744
   0.00399636]]


In [16]:
# Reduce the high dim data to low dim using PCA
# Perform PCA for 2D visualization

from sklearn.decomposition import PCA

PCA_model = PCA(n_components = 2)
PCA_model.fit(embedding_array)
new_values = PCA_model.transform(embedding_array)

In [17]:
print("Shape: " + str(new_values.shape))
print(new_values)

Shape: (7, 2)
[[ 0.37876705 -0.04326331]
 [ 0.36152641 -0.25299494]
 [ 0.33130998 -0.08953431]
 [-0.16780305  0.39842445]
 [-0.02711076  0.483813  ]
 [-0.4234599  -0.25245445]
 [-0.45322972 -0.24399045]]


## Applications of Embeddings

In [6]:
from google.cloud import bigquery
import pandas as pd

In [7]:
def run_bq_query(sql):

    # Create BQ client
    bq_client = bigquery.Client(project = PROJECT_ID, 
                                credentials = credentials)

    # Try dry run before executing query to catch any errors
    job_config = bigquery.QueryJobConfig(dry_run=True, 
                                         use_query_cache=False)
    bq_client.query(sql, job_config=job_config)

    # If dry run succeeds without errors, proceed to run query
    job_config = bigquery.QueryJobConfig()
    client_result = bq_client.query(sql, 
                                    job_config=job_config)

    job_id = client_result.job_id

    # Wait for query/job to finish running. then get & return data frame
    df = client_result.result().to_arrow().to_pandas()
    print(f"Finished job_id: {job_id}")
    return df

In [8]:
# define list of programming language tags we want to query

language_list = ["python", "html", "r", "css"]

In [9]:
so_df = pd.DataFrame()

for language in language_list:
    
    print(f"generating {language} dataframe")
    
    query = f"""
    SELECT
        CONCAT(q.title, q.body) as input_text,
        a.body AS output_text
    FROM
        `bigquery-public-data.stackoverflow.posts_questions` q
    JOIN
        `bigquery-public-data.stackoverflow.posts_answers` a
    ON
        q.accepted_answer_id = a.id
    WHERE 
        q.accepted_answer_id IS NOT NULL AND 
        REGEXP_CONTAINS(q.tags, "{language}") AND
        a.creation_date >= "2020-01-01"
    LIMIT 
        500
    """

    
    language_df = run_bq_query(query)
    language_df["category"] = language
    so_df = pd.concat([so_df, language_df], 
                      ignore_index = True) 

generating python dataframe




Finished job_id: da15a0be-0a9b-49d8-8059-152a32aab0da
generating html dataframe




Finished job_id: 541757bb-6a64-4d7f-8f6f-4d7988713114
generating r dataframe




Finished job_id: c867a7bb-c207-40b8-959e-cfa834c48ce5
generating css dataframe




Finished job_id: 995d0063-dadd-4145-b7e9-1ec5ae16a912


In [10]:
so_df

Unnamed: 0,input_text,output_text,category
0,How to scrape a webpage which has login if we ...,<p>It is usual for web sites to provide pre-po...,python
1,Kivy Slider Touch Input Area Too Large<p>I am ...,"<p>If anyone is still looking for the answer, ...",python
2,What's the difference between Numpy's Structur...,"<p>From <a href=""https://numpy.org/doc/stable/...",python
3,calculate number of non-missing counts in spec...,<pre><code>df['Cnt'] = (\n df.filter(regex=...,python
4,Addition Django Channels to DRF<p>I'm tryna ad...,<p>Eventually I changed to <code>channels_redi...,python
...,...,...,...
1995,Loading in Images via jQuery prepend But After...,<p>You can use the jQuery <code>load</code> ev...,css
1996,What is this popup thing when you hover over s...,"<p>It's called a CSS Tooltip, and it is &quot;...",css
1997,Text is blocked by element on Html page when u...,<p>You can solve this by giving <code>padding-...,css
1998,How to set X position of a flexbox in CSS?<p>a...,"<p>First, if you write the same rule for the s...",css


In [34]:
so_df.to_csv('sample_data/so_database.csv')

In [21]:
# Generate embeddings
# To generate embeddings for a dataset of texts, we'll need to group the sentences together in batches and send batches of texts to the model.
# The API currently can take batches of up to 5 pieces of text per API call.

from vertexai.language_models import TextEmbeddingModel

In [22]:
# load the model

embedding_model = TextEmbeddingModel.from_pretrained(
    "textembedding-gecko")

In [23]:
import time
import numpy as np

In [24]:
# Generator function to yield batches of sentences

def generate_batches(sentences, batch_size = 5):
    for i in range(0, len(sentences), batch_size):
        yield sentences[i : i + batch_size]

In [25]:
so_questions = so_df[0:200].input_text.tolist() 
batches = generate_batches(sentences = so_questions)

In [26]:
batch = next(batches)
len(batch)

5

In [27]:
# Get embeddings on a batch of data
def encode_texts_to_embeddings(sentences):
    try:
        embeddings = model.get_embeddings(sentences)
        return [embedding.values for embedding in embeddings]
    except Exception:
        return [None for _ in range(len(sentences))]

In [28]:
batch_embeddings = encode_texts_to_embeddings(batch)

## QA system using Semantic Search

In [36]:
so_database = pd.read_csv('sample_data/so_database.csv')

In [37]:
print("Shape: " + str(so_database.shape))
print(so_database)

Shape: (2000, 4)
      Unnamed: 0                                         input_text  \
0              0  How to scrape a webpage which has login if we ...   
1              1  Kivy Slider Touch Input Area Too Large<p>I am ...   
2              2  What's the difference between Numpy's Structur...   
3              3  calculate number of non-missing counts in spec...   
4              4  Addition Django Channels to DRF<p>I'm tryna ad...   
...          ...                                                ...   
1995        1995  Loading in Images via jQuery prepend But After...   
1996        1996  What is this popup thing when you hover over s...   
1997        1997  Text is blocked by element on Html page when u...   
1998        1998  How to set X position of a flexbox in CSS?<p>a...   
1999        1999  Change labels text color when input is active<...   

                                            output_text category  
0     <p>It is usual for web sites to provide pre-po...   pytho

In [38]:
# load question embeddings
import numpy as np
from utils import encode_text_to_embedding_batched

ImportError: cannot import name 'encode_text_to_embedding_batched' from 'utils' (/home/younis/.local/lib/python3.10/site-packages/utils/__init__.py)