### Oversampling influencer users using an LLM (Gemini)

In [143]:
import pandas as pd
import textwrap
import google.generativeai as genai
from IPython.display import Markdown
import pandas as pd
import json

def user_row_to_string(user_row):
    """
    Serialize one user row (all of its features) into a single string.

    Every (key, value) pair yielded by ``user_row.items()`` is rendered as
    ``"<key>: <value>; "`` and the pieces are concatenated in iteration
    order. A value of ``None`` is written as ``"N/A"``; every other value
    (including dicts and lists) is written with ``str()``.
    """
    def render(value):
        # Only a literal None gets the placeholder; containers and scalars
        # alike are stringified with str().
        if value is None:
            return "N/A"
        return str(value)

    return "".join(
        f"{key}: {render(value)}; " for key, value in user_row.items()
    )


def to_markdown(text):
    """
    Wrap ``text`` in an IPython ``Markdown`` object rendered as a block quote.

    Bullet characters ('•') are turned into Markdown list markers and every
    line, including blank ones, is prefixed with '> '.
    """
    bulleted = text.replace('•', '  *')
    # The always-true predicate makes textwrap.indent prefix blank lines too.
    quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
    return Markdown(quoted)

# Read the Gemini API key from the local secrets.json file
with open('secrets.json', 'r') as file:
    secrets = json.load(file)

API_KEY = secrets['API_KEY']

# Configuring Gemini API
genai.configure(api_key=API_KEY)

# Loading the data (after undersampling and preprocessing)
users_df = pd.read_pickle("data/undersampled_df.pickle")

# Keep only the raw profile features plus the label.
# NOTE: the 'сourses' column name starts with a Cyrillic 'с' (not ASCII 'c');
# it is kept exactly as written so it matches the pickled data.
users_df = users_df[[
    "id",
    "followers",
    "posts",
    "connections",
    "experience",
    "recommendations",
    "about",
    "position",
    "volunteer_experience",
    "education",
    "certifications",
    "languages",
    "сourses",
    'is_influencer',
]]

In [145]:
# Preparing influencer examples for the prompt:
# the first four rows labelled is_influencer == 1 are serialized to strings.
influencers = users_df[users_df['is_influencer'] == 1]
influencers_examples = [
    user_row_to_string(influencers.iloc[row_idx]) for row_idx in range(4)
]

In [190]:
# Creating new influencer examples with Gemini.
# The same few-shot prompt (four real influencers) is sent 20 times; each
# response is expected to be one synthetic influencer in the
# "key: value; key: value; ..." format produced by user_row_to_string.
model = genai.GenerativeModel('gemini-pro')
text_prompt = f"""
Generate a new example of Linkedin users influencer based on the examples I will give you. The example should:
- Have valid values in all columns.
- Maintain the same structure and format as given examples.
- Exhibit characteristics typical of influencers that you can infer from the given examples, but create new examples that won't be too similar to the given ones.
- connections feature should range between 0 to 500 maximum.
- The label for influencers is 'is_influencer' = 1. So this should be the value for this feature since you are generating influencers.
- Features in generated examples are separated with ';'.
Here are four examples of influencers you should base upon when generating new examples:
    - Example Influencer 1: {influencers_examples[0]}
    - Example Influencer 2: {influencers_examples[1]}
    - Example Influencer 3: {influencers_examples[2]}
    - Example Influencer 4: {influencers_examples[3]}
- Generate the full example within your tokens limit.
"""
examples = []
for i in range(20):
    response = model.generate_content(text_prompt)
    try:
        examples.append(response.text)
    except ValueError:
        # response.text raises ValueError when the response carries no usable
        # text part (e.g. the candidate was blocked). Skip that generation
        # instead of aborting the whole loop; later cells already filter out
        # bad examples, so dropping one here is consistent with the pipeline.
        print(f"generation {i} returned no text, skipping")

In [200]:
# Keep only the generations the LLM finished writing, i.e. those whose text
# ends with '1;' (the tail of 'is_influencer: 1;').
valid_examples = [example for example in examples if example.endswith('1;')]

In [202]:
import pandas as pd
import ast

# Ordered (old, new) substitutions that repair common formatting slips in the
# LLM output: features separated by ',' instead of ';' and feature names
# wrapped in quotes. They are applied sequentially, so the order matters.
# NOTE: 'сourses' starts with a Cyrillic 'с', matching the dataset column name.
_SEPARATOR_FIXES = (
    ('\\n', ''),
    ('[], is_influencer: 1;', '[]; is_influencer: 1;'),
    ("'position", "position"),
    (', education', '; education'),
    (', about', '; about'),
    (', languages', '; languages'),
    (', сourses', '; сourses'),
    (', is_influencer', '; is_influencer'),
    (', connections', '; connections'),
    (', posts', '; posts'),
    (', followers', '; followers'),
    (', experience', '; experience'),
    (', volunteer_experience', '; volunteer_experience'),
    (', position', '; position'),
    (', certifications', '; certifications'),
    (", 'education", "; education"),
    ("'education", "education"),
    ("education'", "education"),
    (", 'education'", "; education"),
    (", 'about'", "; about"),
    (", 'languages'", "; languages"),
    (", 'сourses'", "; сourses"),
    (", 'is_influencer'", "; is_influencer"),
    (", is_influencer", "; is_influencer"),
    (", 'connections'", "; connections"),
    (", 'posts'", "; posts"),
    (", 'followers'", "; followers"),
    (", 'experience'", "; experience"),
    (", 'volunteer_experience'", "; volunteer_experience"),
    (", 'position'", "; position"),
    ("position'", "position"),
    (", 'certifications'", "; certifications"),
    ("'is_influencer", "is_influencer"),
    ("is_influencer'", "is_influencer"),
    (",\nis_influencer", "; is_influencer"),
)


def string_to_df_row(input_string):
    """
    Fix up one LLM-generated example and convert it to a one-row DataFrame.

    The text is expected to look like ``"key: value; key: value; ...;"``.
    Known formatting slips are repaired with ``_SEPARATOR_FIXES``, the
    trailing record terminator ``;`` is removed, and the text is split on
    ``"; "`` into features. Each value is parsed with ``ast.literal_eval``;
    values that are not valid Python literals are kept as plain strings.

    Parameters:
        input_string: raw text of a single generated example.

    Returns:
        A ``pd.DataFrame`` with exactly one row and one column per feature.

    Raises:
        ValueError: if some ``"; "``-separated fragment contains no ``": "``
            (the text is out of format).
    """
    for old, new in _SEPARATOR_FIXES:
        input_string = input_string.replace(old, new)

    # Drop only the final ';' that terminates the record. Replacing every
    # "1;" in the text (as was done before) also removed the separator after
    # any value ending in 1 (e.g. "followers: 1201; posts: ..."), which
    # merged that feature with the next one.
    input_string = input_string.rstrip()
    if input_string.endswith(';'):
        input_string = input_string[:-1]

    data = {}
    for feature in input_string.split('; '):
        # Split on the first ': ' only, so values may themselves contain ': '.
        key, value = feature.split(': ', 1)
        try:
            data[key] = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            # Free text (e.g. the "about" field) is not a Python literal.
            data[key] = value

    return pd.DataFrame([data])

# Parse every complete example into a one-row DataFrame and keep those whose
# columns are all known features of the real data.
expected_columns = set(influencers.columns)
parsed_rows = []
for example in valid_examples:
    try:
        df_example = string_to_df_row(example)
    except Exception:
        # Best-effort parsing: an out-of-format example is reported and
        # skipped. `except Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt / SystemExit still stop the cell.
        print("failed")
        continue
    # Remove examples out of format
    if set(df_example.columns).issubset(expected_columns):
        parsed_rows.append(df_example)

# Concatenate once at the end instead of growing the frame inside the loop,
# which re-copies the accumulated rows on every iteration. pd.concat raises
# on an empty list, so fall back to an empty frame when nothing was parsed.
examples_df = pd.concat(parsed_rows) if parsed_rows else pd.DataFrame()

failed
failed
failed


In [214]:
# Reload the undersampled data and keep the raw features together with their
# *_count companions, in the column order used for modeling.
# NOTE: 'сourses' / 'сourses_count' start with a Cyrillic 'с', as in the data.
original_df = pd.read_pickle("data/undersampled_df.pickle")[[
    'about',
    'connections',
    'is_influencer',
    'certifications',
    'education',
    'experience',
    'followers',
    'languages',
    'position',
    'posts',
    'recommendations',
    'recommendations_count',
    'volunteer_experience',
    'сourses',
    'certifications_count',
    'volunteer_experience_count',
    'сourses_count',
    'education_count',
    'experience_count',
    'posts_count',
    'languages_count',
]]

In [205]:
# Add to the generated examples the '<feature>_count' columns that exist in
# original_df: the length of the value when it is a list, otherwise None.
for list_column in (
    'certifications',
    'volunteer_experience',
    'сourses',
    'education',
    'experience',
    'posts',
    'languages',
    'recommendations',
):
    examples_df[f'{list_column}_count'] = examples_df[list_column].apply(
        lambda x: len(x) if isinstance(x, list) else None
    )

In [206]:
# Another heuristic for removing bad examples: drop rows where any of these
# counts is missing (i.e. the feature was not parsed as a list).
# 'recommendations_count' is not part of this check.
required_counts = [
    'certifications_count',
    'volunteer_experience_count',
    'сourses_count',
    'education_count',
    'experience_count',
    'posts_count',
    'languages_count',
]
examples_df = examples_df.dropna(subset=required_counts)

# Select the modeling columns in a fixed order.
modeling_columns = [
    'about',
    'connections',
    'is_influencer',
    'certifications',
    'education',
    'experience',
    'followers',
    'languages',
    'position',
    'posts',
    'recommendations',
    'recommendations_count',
    'volunteer_experience',
    'сourses',
    'certifications_count',
    'volunteer_experience_count',
    'сourses_count',
    'education_count',
    'experience_count',
    'posts_count',
    'languages_count',
]
examples_df = examples_df[modeling_columns]

# Add the new examples to the original_df and save the final data to pickle
df_for_modeling = pd.concat([original_df, examples_df])
df_for_modeling.to_pickle('df_for_modeling.pickle')