## Imports

In [45]:
import featureform as ff
from featureform import local

import openai

client = ff.Client(local=True)

### Data Processing

In [46]:
chapters = local.register_directory(
    name="dpv-chapters",
    path="170/data/files",
    description="Text from DPV Chapters",
)

In [47]:
ed_posts = local.register_directory(
    name='new170-edstem',
    path='170/data/edstem',
    description='170 Posts from edstem',
)

In [48]:
client.dataframe(chapters)

Applying Run: tender_shirley
Creating provider local-mode 
Creating source dpv-chapters  tender_shirley
Creating source new170-edstem  tender_shirley


Unnamed: 0,filename,body
0,chap2.csv,Chapter⌘Page⌘Text\n2⌘0⌘Chapter 2 Divide-and-co...
1,chap3.csv,Chapter⌘Page⌘Text\n3⌘0⌘Chapter 3 Decomposition...
2,chap7.csv,Chapter⌘Page⌘Text\n7⌘0⌘Chapter 7 Linear progra...
3,chap6.csv,Chapter⌘Page⌘Text\n6⌘0⌘Chapter 6 Dynamic progr...
4,chap4.csv,Chapter⌘Page⌘Text\n4⌘0⌘Chapter 4 Paths ingraph...
5,chap5.csv,Chapter⌘Page⌘Text\n5⌘0⌘Chapter 5 Greedy algori...
6,chap8.csv,Chapter⌘Page⌘Text\n8⌘0⌘Chapter 8 NP-complete p...
7,chap9.csv,Chapter⌘Page⌘Text\n9⌘0⌘Chapter 9 Coping with N...


In [49]:
client.dataframe(ed_posts)

Applying Run: tender_shirley
Creating provider local-mode 


Unnamed: 0,filename,body
0,new170_edstem.csv,"PK⌘Text\n1091⌘""""""[Fall 2023] Apply for CS 170 ..."


In [50]:
# turns every csv file into a dataframe, then concatenates all the dataframes together

@local.df_transformation(inputs=[chapters])
def process_chapter_files84(dir_df):
    from io import StringIO
    import pandas as pd
    
    chapter_dfs = []
    for i, row in dir_df.iterrows():
        print(row[0])
        
        csv_str = StringIO(row[1])
        r_df = pd.read_csv(csv_str, sep="⌘")
        r_df["filename"] = row[0]
        print(r_df.head())
        chapter_dfs.append(r_df)

    return pd.concat(chapter_dfs)

In [51]:
# verify
df = client.dataframe(process_chapter_files84)

df.head()

Applying Run: tender_shirley
Creating provider local-mode 
Creating source process_chapter_files84  tender_shirley
chap2.csv
   Chapter  Page                                               Text   filename
0        2     0  Chapter 2 Divide-and-conquer algorithms Thediv...  chap2.csv
1        2     1  56 Algorithms Let'smove awayfrom complex numbe...  chap2.csv
2        2     2  S.Dasgupta, C.H.Papadimitriou, andU.V.Vazirani...  chap2.csv
3        2     3  58 Algorithms Figure 2.2Divide-and-conquer int...  chap2.csv
4        2     4  S.Dasgupta, C.H.Papadimitriou, andU.V.Vazirani...  chap2.csv
chap3.csv
   Chapter  Page                                               Text   filename
0        3     0  Chapter 3 Decompositions ofgraphs 3.1 Why grap...  chap3.csv
1        3     1  92 Algorithms Figure 3.1(a)Amap and(b)itsgraph...  chap3.csv
2        3     2  S.Dasgupta, C.H.Papadimitriou, andU.V.Vazirani...  chap3.csv
3        3     3  94 Algorithms Figure 3.2Exploring agraph israt...  chap3.



Unnamed: 0,Chapter,Page,Text,filename
0,2,0,Chapter 2 Divide-and-conquer algorithms Thediv...,chap2.csv
1,2,1,56 Algorithms Let'smove awayfrom complex numbe...,chap2.csv
2,2,2,"S.Dasgupta, C.H.Papadimitriou, andU.V.Vazirani...",chap2.csv
3,2,3,58 Algorithms Figure 2.2Divide-and-conquer int...,chap2.csv
4,2,4,"S.Dasgupta, C.H.Papadimitriou, andU.V.Vazirani...",chap2.csv


In [52]:
@local.df_transformation(inputs=[ed_posts])
def process_edstem_files94(ed_df):
    from io import StringIO
    import pandas as pd

    csv_str = StringIO(ed_df.loc[0, "body"])
    r_df = pd.read_csv(csv_str, sep="⌘")
    print(r_df.head())

    return r_df

In [53]:
# verify
df = client.dataframe(process_edstem_files94)

df.head()

Applying Run: tender_shirley
Creating provider local-mode 
Creating source process_edstem_files94  tender_shirley
     PK                                               Text
0  1091  "[Fall 2023] Apply for CS 170 Course Staff! He...
1  1075  "Grading Update. Hi all, We've noticed some in...
2  1046  """Final Exam, HW12, HW13 Grades Published Upd...
3  1026  "[grades NOT published yet] Regrade Logistics ...
4   914  "Scam email from me. Hi all I've gotten severa...




Unnamed: 0,PK,Text
0,1091,"""[Fall 2023] Apply for CS 170 Course Staff! He..."
1,1075,"""Grading Update. Hi all, We've noticed some in..."
2,1046,"""""""Final Exam, HW12, HW13 Grades Published Upd..."
3,1026,"""[grades NOT published yet] Regrade Logistics ..."
4,914,"""Scam email from me. Hi all I've gotten severa..."


In [54]:
# entity ID transformation
@local.df_transformation(inputs=[process_chapter_files84])
def excerpt_preprocess_df(chapter_df):
    # adding a unique identifier for every column
    chapter_df["PK"] = chapter_df.apply(lambda row: f"{row['Chapter']}_{row['Page']}", axis=1)
    
    # source column
    chapter_df["Source"] = ["Textbook"] * len(chapter_df)
    
    # more pre-processing - making columns generic across sources
    chapter_df = chapter_df[["PK", "Text", "Source"]]
    return chapter_df

In [55]:
# verify excerpts
df = client.dataframe(excerpt_preprocess_df)

df.head()

Applying Run: tender_shirley
Creating provider local-mode 
Creating source excerpt_preprocess_df  tender_shirley


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)


Unnamed: 0,PK,Text,Source
0,2_0,Chapter 2 Divide-and-conquer algorithms Thediv...,Textbook
1,2_1,56 Algorithms Let'smove awayfrom complex numbe...,Textbook
2,2_2,"S.Dasgupta, C.H.Papadimitriou, andU.V.Vazirani...",Textbook
3,2_3,58 Algorithms Figure 2.2Divide-and-conquer int...,Textbook
4,2_4,"S.Dasgupta, C.H.Papadimitriou, andU.V.Vazirani...",Textbook


In [56]:
# entity ID transformation
@local.df_transformation(inputs=[process_edstem_files94])
def post_preprocess_pf5(post_df):  
    # pk must be string
    print(post_df)
    post_df["PK"] = [str(pk) + "e" for pk in post_df["PK"]]

    # source column
    post_df["Source"] = ["Edstem"] * len(post_df)
    
    # more pre-processing - making columns generic across sources
    post_df = post_df[["PK", "Text", "Source"]]
    return post_df

In [57]:
# verify ed posts
df = client.dataframe(post_preprocess_pf5)

df.head()

Applying Run: tender_shirley
Creating provider local-mode 
Creating source post_preprocess_pf5  tender_shirley
      PK                                               Text
0   1091  "[Fall 2023] Apply for CS 170 Course Staff! He...
1   1075  "Grading Update. Hi all, We've noticed some in...
2   1046  """Final Exam, HW12, HW13 Grades Published Upd...
3   1026  "[grades NOT published yet] Regrade Logistics ...
4    914  "Scam email from me. Hi all I've gotten severa...
5    871  "Final Review Sessions + Final Logistics. Hi E...
6      2  "Welcome to CS170! We would like to welcome yo...
7     16  "Discussion Schedule Now Available! Greetings!...
8     38  "OH / Homework Party Schedule. Greetings! Here...
9    130  "Homework Solutions. Every week, after the hom...
10   133  "[UPDATE] Changes to Discussion Schedule. Hi e...
11   155  "EdStem Etiquette and Coding OH. Hi all, Thank...
12   176  "Local Setup for Coding Problems. First time s...
13   191  "Additional Discussion Sections. Due to

Unnamed: 0,PK,Text,Source
0,1091e,"""[Fall 2023] Apply for CS 170 Course Staff! He...",Edstem
1,1075e,"""Grading Update. Hi all, We've noticed some in...",Edstem
2,1046e,"""""""Final Exam, HW12, HW13 Grades Published Upd...",Edstem
3,1026e,"""[grades NOT published yet] Regrade Logistics ...",Edstem
4,914e,"""Scam email from me. Hi all I've gotten severa...",Edstem


In [58]:
# COMBINE TABLES ONCE WE HAVE MADE THEM GENERIC

@local.df_transformation(inputs=[excerpt_preprocess_df, post_preprocess_pf5])
def combine_dfs2(excerpts_df, posts_df):
    import pandas as pd
    return pd.concat([excerpts_df, posts_df])

In [59]:
# verify table join
df = client.dataframe(combine_dfs2)

df.head()

Applying Run: tender_shirley
Creating provider local-mode 
Creating source combine_dfs2  tender_shirley


Unnamed: 0,PK,Text,Source
0,2_0,Chapter 2 Divide-and-conquer algorithms Thediv...,Textbook
1,2_1,56 Algorithms Let'smove awayfrom complex numbe...,Textbook
2,2_2,"S.Dasgupta, C.H.Papadimitriou, andU.V.Vazirani...",Textbook
3,2_3,58 Algorithms Figure 2.2Divide-and-conquer int...,Textbook
4,2_4,"S.Dasgupta, C.H.Papadimitriou, andU.V.Vazirani...",Textbook


In [60]:
# vectorize each excerpt
@local.df_transformation(inputs=[combine_dfs2])
def vectorize_excerpts(chapter_df):
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(chapter_df["Text"].tolist())
    chapter_df["Vector"] = embeddings.tolist()
    
    return chapter_df

In [61]:
# verify
df = client.dataframe(vectorize_excerpts)

df.head()

Applying Run: tender_shirley
Creating provider local-mode 
Creating source vectorize_excerpts  tender_shirley




Unnamed: 0,PK,Text,Source,Vector
0,2_0,Chapter 2 Divide-and-conquer algorithms Thediv...,Textbook,"[-0.08655313402414322, 0.006149395368993282, 0..."
1,2_1,56 Algorithms Let'smove awayfrom complex numbe...,Textbook,"[-0.06641228497028351, 0.03119048848748207, -0..."
2,2_2,"S.Dasgupta, C.H.Papadimitriou, andU.V.Vazirani...",Textbook,"[0.0043007563799619675, 0.014555327594280243, ..."
3,2_3,58 Algorithms Figure 2.2Divide-and-conquer int...,Textbook,"[-0.11472944170236588, 0.040149156004190445, -..."
4,2_4,"S.Dasgupta, C.H.Papadimitriou, andU.V.Vazirani...",Textbook,"[-0.034075021743774414, 0.00882561132311821, 0..."


In [62]:
df.tail()

Unnamed: 0,PK,Text,Source,Vector
35,856e,"""TA Small Group Chatting Sessions + General Ad...",Edstem,"[-0.03165562078356743, 0.011771000921726227, 0..."
36,858e,"""Extra Credit Opportunity. Hi all, We would gr...",Edstem,"[-0.046319637447595596, 0.007366633974015713, ..."
37,890e,"""Final room update. Hello, There has been an u...",Edstem,"[0.021930553019046783, 0.02415216527879238, 0...."
38,4e,"""Master Index. We will from time to time un-pi...",Edstem,"[-0.11391334980726242, -0.006270380225032568, ..."
39,39e,"""Discussion Resources. Hi everyone! Your discu...",Edstem,"[-0.13333465158939362, -0.03816254809498787, -..."


In [63]:
# creating weaviate instance
# weaviate = ff.register_weaviate(
#     name="weaviate",
#     url="https://ohpt-weaviate-dbzunfhp.weaviate.network",
#     api_key="sA7p3kjpT18vziFPQRSa4eCkNI7QFuh2ixkX"
# )

pinecone = ff.register_pinecone(
    name="pinecone",
    project_id="56ea356",
    environment="asia-southeast1-gcp-free",
    api_key="8566719a-f7bc-4dc8-b38b-14d39d976807",
)

In [64]:
@ff.entity
class Text_String:
    excerpt_embeddings = ff.Embedding(
        vectorize_excerpts[["PK", "Vector"]],
        dims=384,
        vector_db=pinecone,
        description="Embeddings from excerpts of chapters",
        variant="v1"
    )
    excerpts = ff.Feature(
        combine_dfs2[["PK", "Text"]],
        type=ff.String,
        description="Excerpts' original text",
        variant="v1"
    )

In [65]:
!pip install pinecone-client

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [66]:
client.apply()

Applying Run: tender_shirley
Creating provider local-mode 
Creating provider pinecone 
Creating entity text_string 
Creating feature excerpt_embeddings  v1
Creating feature excerpts  v1




ForbiddenException: (403)
Reason: Forbidden
HTTP response headers: HTTPHeaderDict({'content-length': '0', 'date': 'Sat, 16 Sep 2023 02:07:08 GMT', 'server': 'envoy'})


In [43]:
@ff.ondemand_feature(variant="ohpt")
def relevant_excerpts(client, params, entity):
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("all-MiniLM-L6-v2")
    search_vector = model.encode(params["query"])
    res = client.nearest("excerpt_embeddings", "v1", search_vector, k=5)
    return res

In [44]:
client.apply()
client.features([("relevant_excerpts", "ohpt")], {}, params={"query": "dynamic programming"})

Applying Run: tender_shirley
Creating provider local-mode 
Creating ondemand_feature relevant_excerpts98  ohpt




array([], shape=(1, 0), dtype=float64)

In [36]:
# creates the improved and contextualized prompt
@ff.ondemand_feature(variant="ohpt")
def contextualized_prompt96(client, params, entity):
    pks = client.features([("relevant_excerpts", "ohpt")], {}, params=params)
    # print(pks)
    prompt = "Use the following pages from our textbook to answer the following question\n"
    for pk in pks[0]:
        prompt += "```"
        # print(client.features([("excerpts", "v1")], {"excerpt": pk}))
        prompt += client.features([("excerpts", "v1")], {"excerpt": pk})[0]
        prompt += "```\n"
    prompt += "Question: "
    prompt += params["query"]
    prompt += "?"
    return prompt

In [37]:
client.apply()
client.features([("contextualized_prompt96", "ohpt")], {}, params={"query": "dynamic programming"})

Applying Run: awesome_archimedes
Creating provider local-mode 
Creating ondemand_feature contextualized_prompt96  ohpt




array(["Use the following pages from our textbook to answer the following question\n```Chapter 6 Dynamic programming Inthepreceding chapters wehaveseen some elegant design principles\x97suc hasdivide-and- conquer ,graph exploration, andgreedy choice\x97that yield de\x02nitive algorithms foravariety ofimportant computational tasks .Thedrawbac kofthese tools isthat they canonly beused onvery speci\x02c types ofproblems .Wenow turn tothetwosledgehammer softhealgorithms craft, dynamic programming andlinear programming ,techniques ofvery broad applicability that canbeinvoked when more specialized methods fail. Predictably ,this generality often comes with acostinef\x02ciency . 6.1 Shortest paths indags, revisited Attheconclusion ofourstudy ofshortest paths (Chapter 4),weobserved that theproblem is especially easy indirected acyclicgraphs (dags). Let'srecapitulate thiscase,because itliesat theheart ofdynamic programming . Thespecial distinguishing feature ofadagisthat itsnodes canbelinearize

# Final Demo

In [51]:
client.apply()
q = "What should I know about dynamic programming to help me prepare for the midterm?"
prompt = client.features([("contextualized_prompt96", "ohpt")], {}, params={"query": q})[0]

openai.organization = "org-V70xAGNCjfzw012seLYRWNTJ"
openai.api_key = "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"

print(openai.Completion.create(
    model="text-davinci-003",
    prompt=prompt,
    max_tokens=1000, # The max number of tokens to generate
    temperature=1.0 # A measure of randomness
)["choices"][0]["text"])

Applying Run: awesome_archimedes
Creating provider local-mode 


Answer: Dynamic programming is a technique of very broad applicability, with the goal of iteratively solving one subproblem after the other in order of increasing size. In the context of CS170 Midterm 2, dynamic programming will be used to find the edit distance between two strings, which is the best possible alignment between them. Additionally, applicants should be aware of the concept of a “subproblem” and the property (*) on page 171 which applies to the solving of problems with dynamic programming.


In [52]:
q = "Is Professor John Wright looking for paid Research Assistants?"
prompt = client.features([("contextualized_prompt96", "ohpt")], {}, params={"query": q})[0]

print(openai.Completion.create(
    model="text-davinci-003",
    prompt=prompt,
    max_tokens=1000, # The max number of tokens to generate
    temperature=1.0 # A measure of randomness
)["choices"][0]["text"])





In [58]:
q = "What time are lectures?"
prompt = client.features([("contextualized_prompt96", "ohpt")], {}, params={"query": q})[0]

print(openai.Completion.create(
    model="text-davinci-003",
    prompt=prompt,
    max_tokens=1000, # The max number of tokens to generate
    temperature=1.0 # A measure of randomness
)["choices"][0]["text"])



Answer: Lectures will be held in Lewis 100 every Monday, Wednesday and Friday from 11:00am to 11:59am.


In [54]:
q = "Explain the knapsack algorithm."
prompt = client.features([("contextualized_prompt96", "ohpt")], {}, params={"query": q})[0]

print(openai.Completion.create(
    model="text-davinci-003",
    prompt=prompt,
    max_tokens=1000, # The max number of tokens to generate
    temperature=1.0 # A measure of randomness
)["choices"][0]["text"])



The knapsack algorithm is a dynamic programming approach to solving a problem where a given set of items must be placed in a knapsack bag of a certain capacity to produce the maximum value. The algorithm works by breaking the problem down into smaller subproblems, which are then solved in order to find the optimal solution. It starts with the items with the highest value, and then adds each item one by one, until the total weight of the items in the bag reaches or exceeds the capacity of the bag. The algorithm uses a two-dimensional table to keep track of all possible solutions, and in the end finds the combination with the highest value that does not exceed the bag's capacity. It runs in time O(nW), where n is the number of items and W is the capacity of the bag.


In [55]:
q = "What are the importance of qubits?"
prompt = client.features([("contextualized_prompt96", "ohpt")], {}, params={"query": q})[0]

print(openai.Completion.create(
    model="text-davinci-003",
    prompt=prompt,
    max_tokens=500, # The max number of tokens to generate
    temperature=1.0 # A measure of randomness
)["choices"][0]["text"])



Answer: Qubits are the building blocks of quantum computing which has the potential to dramatically increase computing power. They are so important because quantum computing is so much faster than classical computing. Qubits allow for exponentially increased storage capacity and computing speed due to their ability to exist in multiple states simultaneously. This also makes quantum cryptography more secure, as it would be nearly impossible to crack code using a quantum computer. Additionally, qubits are also used in quantum sensing, which has the potential to improve the accuracy and sensitivity of measurements, making them invaluable for many scientific experiments.


In [56]:
q = "What are the prerequisites to being prepared for CS170?"
prompt = client.features([("contextualized_prompt96", "ohpt")], {}, params={"query": q})[0]

print(openai.Completion.create(
    model="text-davinci-003",
    prompt=prompt,
    max_tokens=1000, # The max number of tokens to generate
    temperature=1.0 # A measure of randomness
)["choices"][0]["text"])


Answer: The prerequisites to being prepared for CS 170 are a good understanding of union find (lecture on 2/20/2023) and MST (lecture on 2/24/2023). Additionally, for the second midterm, students should be prepared on topics from dynamic programming (lecture on 3/1) up to and including multiplicative weights update (lecture on 3/24).


In [57]:
q = "Will we be compensated for the issue with Midterm 2?"
prompt = client.features([("contextualized_prompt96", "ohpt")], {}, params={"query": q})[0]

print(openai.Completion.create(
    model="text-davinci-003",
    prompt=prompt,
    max_tokens=1000, # The max number of tokens to generate
    temperature=1.0 # A measure of randomness
)["choices"][0]["text"])



Answer: Yes, final grades will be computed based on two separate curves, with the second one giving students the benefit of the doubt to improve their grade.
