In [56]:
import os
import openai
import pandas as pd
import importlib
import time 
import re


# Process
```mermaid
graph TD
    A(LinkedIn Export) -->|Python| B(Cleaned Posts)
    B -->|OpenAI CLI| C(JSON Format Posts)
    C -->|Curie| D(Summaries)
    B -->|Python| E(Merged Posts + Summaries)
    D -->|Python| E(Merged Posts + Summaries)
    E -->|OpenAI CLI| F(JSON Post + Summaries)
    F -->|Davinci fine tuning| G(Fine Tuned Model)
```

In [2]:
import src.utils.DataCleaning as dc


In [73]:
# Reload the data-cleaning module so that edits made to it on disk take effect
# without restarting the kernel.
importlib.reload(dc)
# Load the LinkedIn shares export and preview its first rows.
# NOTE(review): loadLinkedInSharesData is defined outside this notebook;
# presumably it returns a pandas DataFrame (the code calls .head() on it) - confirm.
shares = dc.loadLinkedInSharesData('./src/local_data/shares.csv')
shares.head()

Unnamed: 0,Date,ShareLink,ShareCommentary,SharedURL,MediaURL,Visibility
0,12/1/2022 23:00,https://www.linkedin.com/feed/update/urn%3Ali%...,"If the VA wants to hire laid off tech workers,...",,,MEMBER_NETWORK
1,11/23/2022 16:42,https://www.linkedin.com/feed/update/urn%3Ali%...,If you're dealing with zip codes in your data ...,,,MEMBER_NETWORK
2,11/22/2022 15:52,https://www.linkedin.com/feed/update/urn%3Ali%...,"How to create content when you have #ADHD:""\r\...",,,MEMBER_NETWORK
3,11/22/2022 14:54,https://www.linkedin.com/feed/update/urn%3Ali%...,"Unless content creation is your career, being...",,,MEMBER_NETWORK
4,11/18/2022 20:41,https://www.linkedin.com/feed/update/urn%3Ali%...,I DIDN'T REALIZE THE QUOTE WAS FROM THE BANKRU...,,,MEMBER_NETWORK


In [77]:
# Keep only the posts whose commentary is longer than 10 characters.
# Rows with missing commentary have no length, fail the comparison and are
# dropped as well.
shares = shares.loc[
    shares["ShareCommentary"].str.len().gt(10)
]

In [64]:
def remove_url(post):
    """Return ``post`` with every URL removed.

    A URL is taken to be ``://`` (optionally preceded by ``http`` or
    ``https``) followed by a run of non-whitespace characters. Matching stops
    at the first whitespace, so the prose that follows a link is never
    swallowed, and dots inside the last path segment (``file.html``) are part
    of the match rather than left behind. Sentence punctuation, closing
    brackets and quotes directly after the link are kept in the text.

    Parameters
    ----------
    post : str
        Text of a single post.

    Returns
    -------
    str
        The text without URLs. Surrounding whitespace is left untouched, so a
        removed link can leave a double space behind. Non-string input (for
        example ``None`` or ``NaN`` from a missing cell) is returned unchanged
        instead of raising ``TypeError``.
    """
    if not isinstance(post, str):
        return post
    # \S* is greedy; the final character class makes it back off trailing
    # punctuation so "see https://a.com." keeps its full stop.
    url_pattern = r'(?:https?)?://\S*[^\s.,;:!?)\]"\']'
    return re.sub(url_pattern, '', post)

In [84]:
# Assemble the prompt/completion frame; only the ShareCommentary text feeds it.
sample_posts = pd.DataFrame(columns=['prompt', 'completion'])

# The export shows stray double quotes around lines. Telling intended quotes
# apart is left for later, so every double quote is dropped here - together
# with commas and colons - after the URLs are removed; surrounding whitespace
# is trimmed last.
sample_posts['completion'] = (
    shares['ShareCommentary']
    .apply(remove_url)
    .str.translate(str.maketrans('', '', '",:'))
    .str.strip()
)


In [85]:
# Spot-check one cleaned post to eyeball the result of the cleaning step.
# NOTE(review): 130 is presumably a row label carried over from `shares` - confirm.
sample_posts['completion'][130]

"I feel it is a good decision for fb to branch into physical devices; however oculus' core product offering (games) has been developed by the open source community. Now we ate already seeing a developer back lash.  Do you think FB will regret this acquisition?"

In [86]:
# Build one summarisation prompt per cleaned post by prepending a fixed
# instruction to the post text.

summarize_prefix = "Give a one sentence summary of the following linkedin post: \r\n "

# String + Series concatenates element-wise, giving one prompt per post.
summary_posts = summarize_prefix + sample_posts['completion'] 


In [87]:
# Spot-check one assembled prompt (instruction prefix followed by the post text).
summary_posts[1]

"Give a one sentence summary of the following linkedin post: \r\n If you're dealing with zip codes in your data analysis here's a quick video that goes over the process of treating numbers like text. \r\n\r\n#licreatoraccelerator #linkedincreator #linkedincreatoraccelerator #datanalysis #datanalytics #yaads #datascience"

In [88]:
# Create your own local ./.config directory and store the key file from OpenAI there.
# The context manager closes the file handle as soon as the key is read, and
# strip() drops the trailing newline that most editors append - otherwise it
# would become part of the key and of any shell command the key is
# interpolated into.
with open("./.config/openai.key") as key_file:
    key = key_file.read().strip()

In [89]:
# Hand the key that was read from the local key file to the openai client.
openai.api_key = key

# Smoke test: request a summary for one prompt (summary_posts[1]) before
# looping over every post. This was sketched as a reusable curie_summarize(p)
# helper but runs inline here, against text-davinci-003.
response = openai.Completion.create(
    model='text-davinci-003',
    prompt= summary_posts[1],  # a single sample prompt
    max_tokens=100,  # cap on the length of the generated summary
    temperature=0,
    stream=False,  # return the whole completion in one response rather than streaming it
)






In [93]:
# Ask the completion endpoint for a one-sentence summary of every post, one
# request per prompt, and keep the stripped text of the first choice.
completion_settings = dict(
    model='text-davinci-003',
    max_tokens=500,
    temperature=0,
    stream=False,  # the full completion comes back in a single response
)

collected_prompts = []
for post in summary_posts:
    response = openai.Completion.create(prompt=post.strip(), **completion_settings)
    summary_text = response["choices"][0]["text"]
    collected_prompts.append(summary_text.strip())

In [95]:
# Use the generated summaries as the fine-tuning prompts. The list is assigned
# positionally, so it must hold exactly one summary per row of sample_posts.
sample_posts['prompt'] = collected_prompts


In [None]:
# Save the prompt/completion pairs as CSV (the index is written as the first column).
sample_posts.to_csv("./src/local_data/shares2_clean.csv")

In [96]:
# Write the pairs as JSON Lines: one {"prompt": ..., "completion": ...} object
# per row. This is the file passed to the OpenAI CLI prepare_data step further down.
sample_posts.to_json("./src/local_data/shares2.jsonl", orient='records', lines=True)

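# Variations by platform
The OpenAI CLI reads the API key from the `OPENAI_API_KEY` environment variable. How that variable is set depends on the shell, so use the variant that matches your platform. `{key}` stands for the API key loaded earlier in the notebook.

PowerShell
`$env:OPENAI_API_KEY='{key}' ; openai tools fine_tunes.prepare_data -f ./src/local_data/shares2.jsonl -q`

Windows Command Line (shown as run from a notebook cell, hence the leading `!`)
`!set OPENAI_API_KEY={key} && openai tools fine_tunes.prepare_data -f ./src/local_data/shares2.jsonl -q`

Linux Bash/Shell
`export OPENAI_API_KEY='{key}' ; openai tools fine_tunes.prepare_data -f ./src/local_data/shares2.jsonl -q`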

In [None]:
!set OPENAI_API_KEY={key} && openai tools fine_tunes.prepare_data -f ./src/local_data/shares2.jsonl -q

In [None]:
!set OPENAI_API_KEY={key} && openai api fine_tunes.create -t ./src/local_data/shares2_prepared.jsonl -m davinci