# Creating the dataset

In [None]:
from seleniumbase import Driver

# Shared browser session used by every scraping cell below: headless, with
# SeleniumBase's `uc=True` (undetected-chromedriver mode) switched on.
driver = Driver(uc=True, headless = True)

# Extraction from Facebook

In [None]:
# The browser is logged in with a personal account; the aim is to scrape human comments from public posts.
from fb_utils import *

# --- Step 1: collect the permalink of every post shown in the feed -----------
# Absolute XPath of the feed container on the page the driver currently shows.
# NOTE(review): this path is tied to one specific page layout and will stop
# matching as soon as the markup changes.
feed_container = driver.find_element(
    'xpath',
    '/html/body/div[1]/div/div[1]/div/div[3]/div/div/div[1]/div[1]/div/div[2]/div/div/div/div[3]/div/div[3]/div/div[2]')
posts = feed_container.find_elements('tag name', 'div')[13:]  # the first 13 <div>s are not posts

post_links = [get_link(post) for post in posts]

save_to_file(post_links, 'post_links_fb.pkl')

# --- Step 2: visit each post and collect its comments, keyed by page title ---
facebook_comments = {}
for post_link in post_links:
    driver.open(post_link)
    driver.wait(1)
    # Scroll down and back up before reading the page.
    driver.scroll_to_bottom()
    driver.wait(1)
    driver.scroll_to_top()
    driver.wait(1)

    # <title> lives in <head> and is never rendered, so WebElement.text is
    # always '' for it and every post would collapse onto the same key.
    # The WebDriver `title` property returns the real document title.
    title = driver.title
    comments = driver.find_elements('xpath', '//div[@class="kvgmc6g5 cxmmr5t8 oygrvhab hcukyx3x c1et5uql"]')
    if comments:
        # Append in place instead of rebuilding the whole list per comment.
        collected = facebook_comments.setdefault(title, [])
        for comment in comments:
            collected.extend((comment.text, '\n'))

save_to_file(facebook_comments, 'facebook_comments.pkl')

# Extraction from Reddit

In [None]:
# NOTE(review): 'r/' is a placeholder; driver.get() needs full community URLs
# (e.g. 'https://www.reddit.com/r/<name>/') before this cell can run.
communities = ['r/']

# --- Step 1: collect one link per <article> in each community feed -----------
post_links = []
for community_link in communities:
    driver.get(community_link)
    feed = driver.find_element('xpath', '//*[@id="main-content"]/div[2]/shreddit-feed')
    # find_elements() returns a list, so each <article> has to be queried on
    # its own; calling find_element() on the list raises AttributeError.
    for article in feed.find_elements('tag name', 'article'):
        post_links.append(article.find_element('tag name', 'a').get_attribute('href'))

save_to_file(post_links, 'post_links_reddit.pkl')

# --- Step 2: visit each post and collect its comments, keyed by post title ---
reddit_comments = {}
for post_link in post_links:
    driver.get(post_link)
    # The title element's id embeds the post id ("post-title-t3_<id>").
    # Match on the prefix so the lookup works for every post, not just the one
    # whose id was previously hard-coded. The post body was also fetched via a
    # hard-coded id but never stored, so that lookup has been dropped.
    title = driver.find_element('xpath', '//*[starts-with(@id, "post-title-")]').text

    comments = driver.find_elements('tag name', 'shreddit-comment')
    if comments:
        # Append in place instead of rebuilding the whole list per comment.
        collected = reddit_comments.setdefault(title, [])
        for comment in comments:
            collected.extend((comment.text, '\n'))

save_to_file(reddit_comments, 'reddit_comments.pkl')

# Extraction from Twitter

In [None]:
import pickle
import time
from twitter_utils import *

# Phase 1: open the search page and gather a link for every collected tweet
driver.get("https://twitter.com/search?q=public%20tweets&src=typed_query")

tweets = scroll_and_collect_tweets(driver)
# Keep only the tweets for which a (truthy) link could be extracted.
post_links = list(filter(None, map(get_tweet_link, tweets)))

# Persist the links so the second phase can be re-run on its own
save_to_file(post_links, 'post_links_twitter.pkl')

# Phase 2: visit each tweet and record what is found under it, keyed by tweet text
twitter_comments = {}

for post_link in post_links:
    driver.get(post_link)
    time.sleep(3)  # fixed pause so the page can finish loading

    # The first tweetText element on the page is used as the dictionary key.
    try:
        title = driver.find_element('xpath', '//div[@data-testid="tweetText"]').text
    except Exception as e:
        print(f"Error getting tweet text: {e}")
        continue

    # NOTE(review): confirm against the live DOM that data-testid="reply"
    # selects reply *text* and not the reply button under each tweet.
    try:
        for reply in driver.find_elements('xpath', '//div[@data-testid="reply"]'):
            reply_body = reply.text
            twitter_comments.setdefault(title, []).extend([reply_body, '\n'])
    except Exception as e:
        print(f"Error getting replies: {e}")

# Write the title -> replies mapping to disk
save_to_file(twitter_comments, 'twitter_comments.pkl')


# Combining to create the dataset

In [None]:
import pickle
import random
import pandas as pd
from utils import *

# Read back the .pkl files written by the Facebook, Reddit and Twitter cells
facebook_comments = load_data('facebook_comments.pkl')
reddit_comments = load_data('reddit_comments.pkl')
twitter_comments = load_data('twitter_comments.pkl')

# Emit one text file per platform via create_dataset_format
for platform_comments, target_file in (
    (facebook_comments, 'facebook_language_model_dataset.txt'),
    (reddit_comments, 'reddit_language_model_dataset.txt'),
    (twitter_comments, 'twitter_language_model_dataset.txt'),
):
    create_dataset_format(platform_comments, target_file)

# Optional helper: merge several dataset files into one
def combine_datasets(output_file, *datasets):
    """Concatenate the UTF-8 text files in *datasets* into *output_file*.

    The output file is created (or truncated) first; each input is then
    appended in the order given, with nothing inserted between files.

    Args:
        output_file: Path of the file to write.
        *datasets: Paths of the files to read, in concatenation order.

    Raises:
        OSError: If the output cannot be opened for writing or an input
            cannot be opened for reading.
    """
    with open(output_file, 'w', encoding='utf-8') as merged:
        for source_path in datasets:
            with open(source_path, 'r', encoding='utf-8') as source:
                # Stream line by line rather than loading the whole file.
                merged.writelines(source)

# Merge the three per-platform files into a single file (optional step)
platform_files = (
    'facebook_language_model_dataset.txt',
    'reddit_language_model_dataset.txt',
    'twitter_language_model_dataset.txt',
)
combine_datasets('combined_language_model_dataset.txt', *platform_files)

# Read the merged file back in ...
dataset = load_dataset_from_file('combined_language_model_dataset.txt')

# ... and write it out again as CSV
save_dataset_to_csv(dataset, 'language_model_dataset.csv')

Dealing with the second part of the dataset

# Chat Finetuning

In [None]:
import pandas as pd
# The Hugging Face package is the lowercase `datasets`; Python module names
# are case-sensitive, so `from Datasets import ...` raises ModuleNotFoundError.
from datasets import Dataset

# Download the persona-chat CSV and preview it.
df = pd.read_csv('https://raw.githubusercontent.com/Deepakchawla/streamlit-example/master/cleaned_dataset_personality.csv')
df.head()

# Convert to a Hugging Face Dataset and drop the 'Unnamed: 0' column
# (presumably the index column saved with the CSV).
ds = Dataset.from_pandas(df)
ds = ds.remove_columns('Unnamed: 0')
ds[0]

# Defining GPT-LLM object using Langchain

In [None]:
from langchain.chat_models import AzureChatOpenAI
from langchain import PromptTemplate
import os

# Azure OpenAI connection settings. The endpoint and key are read from the
# environment (a missing variable raises KeyError); the deployment name is fixed.
BASE_URL = os.environ["AZURE_OPENAI_ENDPOINT"]
DEPLOYMENT_NAME = "gpt4"
API_KEY = os.environ["AZURE_OPENAI_API_KEY"]

# Chat model client shared by every chain built in the cells below.
llm = AzureChatOpenAI(
    azure_deployment=DEPLOYMENT_NAME,
    model_name="gpt-4",
    openai_api_version="2024-02-01",
    # temperature=0.7,
    openai_api_key=API_KEY,
    azure_endpoint=BASE_URL,
)

# Setting the prompt for enhancing the dataset using GPT

In [None]:

# Template text for the personality-enrichment request. It has two
# placeholders: {user2_personality} (the short persona of user 2) and
# {conversation} (the dialogue between user 1 and user 2). The model is asked
# to answer with a character sketch of user 2 under an invented name.
prompt = """This is the basic personality setting of user 2:

        {user2_personality}


        This is the conversation between user 1 and user 2:
        
        {conversation}


        After analyzing the above information your task is to give a detailed personality of user2, along with some examples of what he would do in various situations you have inferred from his personality.
        Use a suitable random name for user 2 in your answer, and replace that name with every occurrence of user 2, 
        You can create fictional data about the personality of user 2 which does not conflict with the provided conversation.
        You are supposed to give the answer in a format such that you are explaining the personality of user 2 to me. You can NOT include user 1 in user 2's personality traits.
        Just include the facts, the habits and how the user 2 would behave in different situations. Make them very descriptive.
        Add any missing  or additional fictional details which make the final answer even better and revealing about user 2.

        You are only supposed to give the character sketch of user 2 in your answer with the chosen name, do not mention user1 or user2 in the answer. Don't give any other information as the output. your output should be directly usable as a character sketch of the person without any additional details provided. You are giving a model answer and not talking with the user, you just have to give the answer without conversing. Where you directly start with the character sketch with only facts about user 2, without ever mentioning the gfact that user 2's name has been changed.
        You are telling me the personality of user 2 in a very formal manner where we dont do uneccessary talking and only talk about facts. You just directly give the facts to me without any pleasentries. For example you just start with Alex is a this... or you start with Mary is like this...
        """

# `prompt` is rebound here: from the raw template string to a PromptTemplate
# that declares the two placeholders.
prompt = PromptTemplate(template=prompt, input_variables=["user2_personality", "conversation"] )

# Pipe the filled-in template into the chat model; `llm_chain.invoke({...})`
# takes a dict with both placeholder values.
llm_chain = prompt | llm

In [None]:
import concurrent.futures
from tqdm import tqdm
import os
import pickle


def infer(example):
    """Attach an LLM-generated character sketch to *example*.

    Sends the 'user 2 personas' and 'Best Generated Conversation' fields of
    *example* through the module-level ``llm_chain`` and stores the reply text
    under ``example['personality']``.

    This is best-effort: any ordinary failure (API error, missing key, ...)
    is reported and recorded as ``personality = None`` so one bad example does
    not abort a whole batch.

    Args:
        example: Mutable mapping for one dataset row; modified in place.

    Returns:
        The same *example* object, with 'personality' set.
    """
    try:
        response = llm_chain.invoke({
            'user2_personality': example['user 2 personas'],
            'conversation': example['Best Generated Conversation'],
        })
        example['personality'] = response.content
    except Exception as exc:
        # `Exception` rather than a bare `except:` so KeyboardInterrupt and
        # SystemExit still stop the run; the reason is reported, not discarded.
        print(f"infer: personality generation failed: {exc!r}")
        example['personality'] = None
    return example

# Results are checkpointed to disk after every batch so that an interrupted
# run can pick up where it stopped.
save_dir = r'C:\Users\chait\Downloads\super_res'
# os.path.join avoids the invalid '\s' escape in the old literal '\sup_res.pkl'.
save_path = os.path.join(save_dir, 'sup_res.pkl')
os.makedirs(save_dir, exist_ok=True)

super_results = []
if os.path.exists(save_path):
    # First run: there is no checkpoint yet, so only load when the file exists.
    # NOTE: pickle.load can execute arbitrary code; only load checkpoints
    # written by this notebook.
    with open(save_path, 'rb') as f:
        super_results = pickle.load(f)

BATCH_SIZE = 80
# Resume after the rows already present in the checkpoint. Restarting from 0
# would append duplicates of everything that was loaded above.
# NOTE(review): this assumes the checkpoint was produced from this same `ds`,
# in order. Delete the file to force a full re-run.
for i in range(len(super_results), len(ds), BATCH_SIZE):
    # Clamp the final batch: indices past len(ds) make ds.select() raise.
    batch_end = min(i + BATCH_SIZE, len(ds))
    temp_ds = ds.select(list(range(i, batch_end))).to_list()

    # One thread per example: the work is waiting on the LLM API.
    with concurrent.futures.ThreadPoolExecutor(max_workers=BATCH_SIZE) as executor:
        results = list(tqdm(executor.map(infer, temp_ds), total=len(temp_ds), desc=f"Processing batch {i}"))

    super_results.extend(results)

    # Rewrite the full checkpoint after each batch.
    with open(save_path, 'wb') as file:
        pickle.dump(super_results, file)

In [None]:
def convo(example):
    """Turn the raw 'User 1: ... User 2: ...' transcript into named turns.

    Reads ``example['Best Generated Conversation']`` and
    ``example['personality']``. The module-level ``llm`` is asked for the first
    name used in the character sketch (user 2); user 1 gets a random name from
    a fixed list. The transcript is then split into
    ``{"user": ..., "response": ...}`` dicts.

    Args:
        example: Mapping for one dataset row.

    Returns:
        *example* with two keys added: 'conversation' (list of turn dicts) and
        'names' (``{'user1': ..., 'user2': ...}``).
    """
    # Imports are kept inside the function, as in the original, so it is
    # self-contained when run in the worker processes of `Dataset.map`.
    import random
    from langchain.chat_models import AzureChatOpenAI
    from langchain import PromptTemplate
    from datasets import load_dataset, Dataset
    import re

    names = [
        "James", "Michael", "Robert", "John", "David", "William", "Richard", "Joseph", "Thomas", "Christopher",
        "Charles", "Daniel", "Matthew", "Anthony", "Mark", "Donald", "Steven", "Andrew", "Paul", "Joshua",
        "Kenneth", "Kevin", "Brian", "Timothy", "Ronald", "George", "Jason", "Edward", "Jeffrey", "Ryan",
        "Jacob", "Nicholas", "Gary", "Eric", "Jonathan", "Stephen", "Larry", "Justin", "Scott", "Brandon",
        "Benjamin", "Samuel", "Gregory", "Alexander", "Patrick", "Frank", "Raymond", "Jack", "Dennis", "Jerry",
        "Tyler", "Aaron", "Jose", "Adam", "Nathan", "Henry", "Zachary", "Douglas", "Peter", "Kyle", "Noah",
        "Ethan", "Jeremy", "Christian", "Walter", "Keith", "Austin", "Roger", "Terry", "Sean", "Gerald",
        "Carl", "Dylan", "Harold", "Jordan", "Jesse", "Bryan", "Lawrence", "Arthur", "Gabriel", "Bruce",
        "Logan", "Billy", "Joe", "Alan", "Juan", "Elijah", "Willie", "Albert", "Wayne", "Randy", "Mason",
        "Vincent", "Liam", "Roy", "Bobby", "Caleb", "Bradley", "Russell", "Lucas",
        "Mary", "Patricia", "Jennifer", "Linda", "Elizabeth", "Barbara", "Susan", "Jessica", "Karen", "Sarah",
        "Lisa", "Nancy", "Sandra", "Betty", "Ashley", "Emily", "Kimberly", "Margaret", "Donna", "Michelle",
        "Carol", "Amanda", "Melissa", "Deborah", "Stephanie", "Rebecca", "Sharon", "Laura", "Cynthia", "Dorothy",
        "Amy", "Kathleen", "Angela", "Shirley", "Emma", "Brenda", "Pamela", "Nicole", "Anna", "Samantha",
        "Katherine", "Christine", "Debra", "Rachel", "Carolyn", "Janet", "Maria", "Olivia", "Heather", "Helen",
        "Catherine", "Diane", "Julie", "Victoria", "Joyce", "Lauren", "Kelly", "Christina", "Ruth", "Joan",
        "Virginia", "Judith", "Evelyn", "Hannah", "Andrea", "Megan", "Cheryl", "Jacqueline", "Madison", "Teresa",
        "Abigail", "Sophia", "Martha", "Sara", "Gloria", "Janice", "Kathryn", "Ann", "Isabella", "Judy",
        "Charlotte", "Julia", "Grace", "Amber", "Alice", "Jean", "Denise", "Frances", "Danielle", "Marilyn",
        "Natalie", "Beverly", "Diana", "Brittany", "Theresa", "Kayla", "Alexis", "Doris", "Lori", "Tiffany"
    ]
    prompt = """Here is the character sketch 

    {sketch}

    Tell me the name of the character whom we are talking about. just tell me the name and nothing else for example if the name is Jack you just say Jack. if the name is Mary Smith you just say Mary, just return the fist name of the character."""
    prompt = PromptTemplate(template=prompt, input_variables=['sketch'] )
    llm_chain = prompt | llm

    t = example['Best Generated Conversation']
    # Strip surrounding whitespace/newlines from the model's reply so the name
    # can be embedded cleanly in later prompts.
    user2_name = llm_chain.invoke({'sketch' : example['personality']}).content.strip()
    # Never give both speakers the same name.
    user1_name = random.choice([name for name in names if name != user2_name])

    def replace_user_names(match):
        # A placeholder mentioning "1" becomes user 1's name, "2" user 2's.
        if "1" in match.group(0):
            return user1_name
        elif "2" in match.group(0):
            return user2_name
        return match.group(0)

    # NOTE(review): the second alternative only matches "(...)" immediately
    # followed by "{...}"; if "(...)" and "{...}" were meant as separate
    # alternatives a "|" is missing. Left unchanged; confirm against the data.
    t = re.sub(r"\[.*?\]|\(.*?\)\{.*?\}", replace_user_names, t)
    t = t.split(" User 1: ")
    t[0] = t[0].replace('User 1: ', '')

    turns = []
    for segment in t:
        x = segment.split(' User 2: ')
        if len(x) == 1:
            # No reply from user 2: stop here and keep only the complete turns
            # before it. Previously the *last* element was dropped instead,
            # which left raw strings mixed with dicts when the gap was not at
            # the end.
            break
        turns.append({"user" : x[0], "response" : x[1]})

    example['conversation'] = turns
    example['names'] = {'user1': user1_name, 'user2': user2_name}
    return example

# Apply `convo` to every example, passing num_proc=50 to `map`.
# NOTE(review): `convo` reads the 'personality' and 'Best Generated Conversation'
# fields, so `dataset` must already hold the enriched rows at this point. The
# cell that builds it is not visible here; confirm before running.
dataset = dataset.map(convo, num_proc = 50)
dataset

In [None]:
import pandas as pd
import pickle
from datasets import Dataset
# Restore the dataset object that was pickled to disk earlier.
with open(r"C:\Users\chait\Downloads\dataset.pkl", 'rb') as f:
    ds = pickle.load(f)

# Tabulate it, discard rows containing missing values, and renumber from 0.
df = pd.DataFrame(ds).dropna().reset_index(drop=True)
ds

In [None]:
from langchain.chat_models import AzureChatOpenAI
from langchain import PromptTemplate
from langchain_core.output_parsers.json import JsonOutputParser
from tqdm import tqdm
import re


def names(row):
    """Ask the LLM for the two speaker names used in ``row['conversation']``.

    The stringified conversation is sent through the module-level ``llm`` with
    a request for a JSON object, and the reply is parsed with
    ``JsonOutputParser``.

    Args:
        row: One DataFrame row (or mapping) with a 'conversation' entry.

    Returns:
        Whatever the parser yields for the reply; the prompt requests an
        object with the keys 'user1' and 'user2'.
    """
    prompt = """I need to extract the names of 'user 1' and 'user 2' from the text, which are enclosed within triple curly brackets like this
    '''{value_1}'''. The format should be a JSON object with two key-value pairs. 
    user1: user 1 name and user2 : user 2 name. 

    please output the information in structured JSON format without using markdown code blocks."""
    prompt = PromptTemplate(template=prompt, input_variables=["value_1"] )
    parser = JsonOutputParser()
    llm_chain = prompt | llm
    t = llm_chain.invoke({'value_1': str(row['conversation'])}).content
    # Pass the outermost {...} span *including* its braces: group(1) of the old
    # pattern stripped them, leaving text that is no longer a JSON object.
    # If the reply has no braces at all, let the parser see the raw text
    # instead of crashing on `None.group`.
    json_span = re.search(r'\{.*\}', t, re.DOTALL)
    return parser.invoke(json_span.group(0) if json_span else t)

# `DataFrame.progress_apply` only exists after tqdm has registered its pandas
# integration; without this call the next line raises AttributeError.
tqdm.pandas()
df['names'] = df.progress_apply(names, axis=1)
df

In [None]:
# Chat Template
# <|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>
# <|start_header_id|>user<|end_header_id|>\n\n{user_message}<|eot_id|>
# <|start_header_id|>assistant<|end_header_id|>\n\n{assistant_message}<|eot_id|>

# System-message template; filled per example with both speaker names and the
# generated personality sketch.
system_prompt = "You are {user2_name}. Your role is to engage in conversation with {user1_name}, embodying the personality traits outlined in:\n\n{personality}\n\nRespond to {user1_name} according to these characteristics as the conversation unfolds."


def complete(example):
    """Render one example as a single chat-formatted training string.

    The string starts with the begin-of-text token and a system message built
    from ``system_prompt``. It is followed, for each turn in
    ``example['conversation']``, by a user message and an assistant message.
    The first and second values of each turn dict are used, in that order.

    Args:
        example: Mapping with 'names' (keys 'user1', 'user2'), 'personality'
            and 'conversation' (list of turn dicts).

    Returns:
        ``{'text': <formatted string>}``.
    """
    header_open = "<|start_header_id|>"
    header_close = "<|end_header_id|>"
    end_of_turn = "<|eot_id|>"

    def message(role, body):
        # One "<header>role</header>\n\nbody<eot>" segment.
        return header_open + role + header_close + '\n\n' + body + end_of_turn

    system_text = system_prompt.format(
        user2_name=example['names']['user2'],
        user1_name=example['names']['user1'],
        personality=example['personality'],
    )
    pieces = ["<|begin_of_text|>" + message('system', system_text)]

    for turn in example['conversation']:
        values = list(turn.values())
        pieces.append(message("user", values[0]))
        pieces.append(message("assistant", values[1]))

    return {'text': ''.join(pieces)}

# Run `complete` over every example of `dataset`; each result carries the
# rendered chat string under 'text'.
final_ds = dataset.map(complete)
final_ds

In [None]:
# Persist the formatted dataset under the relative path 'dataset'.
final_ds.save_to_disk('dataset')