# BAB 3

##### Import Libraries

In [2]:
import os
import ast
import json
from langdetect import detect, LangDetectException
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

##### Paths

In [3]:
# dataset
post_info_path = '../../dataset/post_info.txt'
images_dir = '../../dataset/images'
json_dir = '../../dataset/json'
profiles_influencers_dir = '../../dataset/profiles_influencers'

# preprocessed data
missing_images_posts_path = '../../dataset/missing_images_posts.txt'
post_with_existing_images_info_path = '../../dataset/post_with_existing_images_info.txt'

missing_json_posts_path = '../../dataset/missing_json_posts.txt'
post_with_existing_images_and_json_info_path = '../../dataset/post_with_existing_images_and_json_info.txt'

missing_caption_posts_path = '../../dataset/missing_caption_posts.txt'
post_with_existing_images_json_and_caption_info_path = '../../dataset/post_with_existing_images_json_and_caption_info.txt'

missing_english_caption_posts_path = '../../dataset/missing_english_caption_posts.txt'
post_with_existing_images_json_caption_and_english_info_path = '../../dataset/post_with_existing_images_json_caption_and_english_info.txt'

contain_giveaway_caption_posts_path = '../../dataset/contain_giveaway_caption_posts.txt'
post_clean_info_path = '../../dataset/post_clean_info.txt'

influencers_list_path = '../../dataset/influencers_list_of_clean_data.csv'
influencers_list_clean_path = '../../dataset/influencers_list_clean.csv'
influencers_error_list_path = '../../dataset/influencers_error_list.txt'

# exploration data
exploration_data_dir = '../../dataset/results'

exploration_data_clean_dir = '../../dataset/clean'

exploration_data_clean_banget_dir = '../../inti/eda/clean-banget'

post_with_micro_influencer_info_path = '../../dataset/post_with_micro_influencer_info.txt'

post_with_micro_influencer_clean_info_path = '../../dataset/post_with_micro_influencer_clean_info.txt'

post_sampled_info_path = '../../dataset/post_sampled_info.txt'

post_13000_sampled_info_path = '../../dataset/post_13000_sampled_info.txt'

post_34000_sampled_info_path = '../../dataset/post_34000_sampled_info.txt'

post_34000_sampled_clean_info_path = '../../dataset/post_34000_sampled_clean_info.txt'

influencers_after_sampling_list_path = '../../dataset/influencers_after_sampling_list.csv'

influencers_13000_after_sampling_list_path = '../../dataset/influencers_13000_after_sampling_list.csv'

influencers_34000_after_sampling_list_path = '../../dataset/influencers_34000_after_sampling_list.csv'

influencers_34000_after_sampling_list_clean_path = '../../dataset/influencers_34000_after_sampling_list_clean.csv'

exploration_data_after_sampling_dir = '../../dataset/results-after-exploration'

exploration_13000_data_after_sampling_dir = '../../dataset/results-13000-after-exploration'

exploration_34000_data_after_sampling_dir = '../../dataset/results-34000-after-exploration'

exploration_34000_data_after_sampling_clean_dir = '../../inti/eda/clean-banget/after-sampling'

exploration_34000_data_after_sampling_clean_banget_dir = '../../inti/model-results/results-34000-after-exploration-clean-banget'

##### Total posts **(raw from original dataset)**

In [3]:
def count_posts(file_path=post_info_path):
    count = 0
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                count += 1
    return count

In [4]:
total_posts_original = count_posts()
print(f"Total posts: {total_posts_original}")

Total posts: 1601074


## Preprocessing Data

##### Create and total posts with missing image(s)

In [12]:
def create_missing_images_posts_file(post_info_path, images_dir):
    missing_posts = []
    with open(post_info_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            # Split row on tab and use the last column for image list
            columns = line.strip().split("\t")
            # Use ast.literal_eval to get a list of image names
            try:
                images = ast.literal_eval(columns[-1])
            except Exception:
                images = []
            missing = False
            for img in images:
                if img.lower().endswith('.jpg'):
                    image_path = os.path.join(images_dir, img)
                    if not os.path.exists(image_path):
                        missing = True
                        break
            if missing:
                missing_posts.append(line)

    # Write missing posts to file
    with open(missing_images_posts_path, "w", encoding="utf-8") as f_out:
        f_out.writelines(missing_posts)

In [13]:
# create_missing_images_posts_file(post_info_path, images_dir)

In [14]:
def count_total_posts_with_missing_images(file_path=missing_images_posts_path):
    return count_posts(file_path)

In [15]:
total_posts_with_missing_images = count_total_posts_with_missing_images()
print(f"Total posts with missing images: {total_posts_with_missing_images}")

Total posts with missing images: 814


##### Create new **post info txt** file with no missing image(s)

In [16]:
def create_no_missing_images_post_info_file(post_info_path, missing_images_posts_path, post_with_existing_images_info_path):
    with open(missing_images_posts_path, "r", encoding="utf-8") as f_missing:
        missing_posts = set(line.strip() for line in f_missing)

    with open(post_info_path, "r", encoding="utf-8") as f_in, \
            open(post_with_existing_images_info_path, "w", encoding="utf-8") as f_out:

        row_number = 0
        for line in f_in:
            if line.strip() and line.strip() not in missing_posts:
                f_out.write(str(row_number) + line[line.find('\t'):])
                row_number += 1

In [17]:
# create_no_missing_images_post_info_file(post_info_path, missing_images_posts_path, post_with_existing_images_info_path)

##### Total posts **(after removing posts with missing images)**

In [18]:
def count_post_with_existing_images(file_path=post_with_existing_images_info_path):
    return count_posts(file_path)

In [19]:
total_posts_with_existing_images = count_post_with_existing_images()
print(f"Total posts with existing images: {total_posts_with_existing_images}")


Total posts with existing images: 1600260


##### Create and total posts with missing json file

In [20]:
def create_missing_json_posts_file(post_with_existing_images_info_path, json_dir, missing_json_posts_path):
    missing_json_posts = []
    with open(post_with_existing_images_info_path, "r", encoding="utf-8") as f_in:
        for line in f_in:
            line = line.strip()
            if not line:
                continue

            parts = line.split('\t')
            if len(parts) < 4:
                continue

            json_file_name = parts[3]
            json_file_path = os.path.join(json_dir, json_file_name)

            if not os.path.exists(json_file_path):
                missing_json_posts.append(line + '\n')
                print(f"JSON file not found: {json_file_path}")

    with open(missing_json_posts_path, "w", encoding="utf-8") as f_out:
        f_out.writelines(missing_json_posts)

In [21]:
# create_missing_json_posts_file(post_with_existing_images_info_path, json_dir, missing_json_posts_path)

In [22]:
def count_total_posts_with_missing_json(file_path=missing_json_posts_path):
    return count_posts(file_path)

In [23]:
total_posts_with_missing_json = count_total_posts_with_missing_json()
print(f"Total posts with missing JSON: {total_posts_with_missing_json}")

Total posts with missing JSON: 125


##### Create new **post info txt** file with no missing image(s) and missing json

In [24]:
def create_no_missing_images_and_no_missing_json_post_info_file(post_with_existing_images_info_path, missing_json_posts_path, post_with_existing_images_and_json_info_path):
    with open(missing_json_posts_path, "r", encoding="utf-8") as f_missing:
        missing_posts = set(line.strip() for line in f_missing)

    with open(post_with_existing_images_info_path, "r", encoding="utf-8") as f_in, \
            open(post_with_existing_images_and_json_info_path, "w", encoding="utf-8") as f_out:

        row_number = 0
        for line in f_in:
            if line.strip() and line.strip() not in missing_posts:
                f_out.write(str(row_number) + line[line.find('\t'):])
                row_number += 1

In [25]:
create_no_missing_images_and_no_missing_json_post_info_file(post_with_existing_images_info_path, missing_json_posts_path, post_with_existing_images_and_json_info_path)

##### Total posts **(after removing posts with missing images and missing json)**

In [26]:
def count_post_with_existing_images_and_json(file_path=post_with_existing_images_and_json_info_path):
    return count_posts(file_path)

In [27]:
total_posts_with_existing_images_and_json = count_post_with_existing_images_and_json()
print(f"Total posts with existing images and JSON: {total_posts_with_existing_images_and_json}")

Total posts with existing images and JSON: 1600135


##### Create and total posts with missing caption

In [28]:
def create_missing_caption_posts_file(post_with_existing_images_and_json_info_path, json_dir, missing_caption_posts_path):
    # Read existing missing posts into a set for quick lookup
    existing_missing_posts = set()
    last_processed_line = None  # Store the last processed line

    if os.path.exists(missing_caption_posts_path):
        with open(missing_caption_posts_path, "r", encoding="utf-8") as f_missing:
            for line in f_missing:
                line = line.strip()
                existing_missing_posts.add(line)
                last_processed_line = line  # Update the last processed line

    with open(post_with_existing_images_and_json_info_path, "r", encoding="utf-8") as f_in, \
            open(missing_caption_posts_path, "a", encoding="utf-8") as f_out:

        if last_processed_line:
            # Skip lines until the last processed line is found
            for line in f_in:
                line = line.strip()
                if line == last_processed_line:
                    print(f"Resuming from: {line}")
                    break  # Start processing from the next line
            else:
                print("Last processed line not found in input file. Processing from the beginning.")
                f_in.seek(0)  # Reset file pointer to the beginning

        for line in f_in:
            line = line.strip()
            if not line:
                continue

            # Skip if the post is already in the missing_caption_posts file
            if line in existing_missing_posts:
                continue

            parts = line.split('\t')
            if len(parts) < 4:
                continue

            json_file_name = parts[3]
            json_file_path = os.path.join(json_dir, json_file_name)

            missing = False
            try:
                with open(json_file_path, 'r', encoding='utf-8') as f_json:
                    json_data = json.load(f_json)
                    if ('edge_media_to_caption' in json_data and
                            'edges' in json_data['edge_media_to_caption'] and
                            len(json_data['edge_media_to_caption']['edges']) > 0):
                        caption_text = json_data['edge_media_to_caption']['edges'][0]['node']['text']
                        if not caption_text.strip():
                            print(f"Missing caption: {json_file_path}")
                            missing = True
                    else:
                        missing = True
            except FileNotFoundError:
                print(f"JSON file not found: {json_file_path}")
                missing = True
            except json.JSONDecodeError:
                print(f"Error decoding JSON file: {json_file_path}")
                missing = True
            except KeyError:
                print(f"KeyError in JSON file: {json_file_path}")
                missing = True

            if missing:
                f_out.write(line + '\n')
                f_out.flush()

In [29]:
create_missing_caption_posts_file(post_with_existing_images_and_json_info_path, json_dir, missing_caption_posts_path)

Resuming from: 1599407	kitana_red	0	1388894981363963867.json	['1388894981363963867.jpg']


In [30]:
def count_total_posts_with_missing_caption(file_path=missing_caption_posts_path):
    return count_posts(file_path)

In [31]:
total_posts_with_missing_caption = count_total_posts_with_missing_caption()
print(f"Total posts with missing caption: {total_posts_with_missing_caption}")

Total posts with missing caption: 8057


##### Create new **post info txt** file with no missing image(s), json, and caption

In [32]:
def create_no_missing_images_json_and_caption_post_info_file(post_with_existing_images_and_json_info_path, missing_caption_posts_path, post_with_existing_images_json_and_caption_info_path):
    with open(missing_caption_posts_path, "r", encoding="utf-8") as f_missing:
        missing_posts = set(line.strip() for line in f_missing)

    with open(post_with_existing_images_and_json_info_path, "r", encoding="utf-8") as f_in, \
            open(post_with_existing_images_json_and_caption_info_path, "w", encoding="utf-8") as f_out:

        row_number = 0
        for line in f_in:
            if line.strip() and line.strip() not in missing_posts:
                f_out.write(str(row_number) + line[line.find('\t'):])
                row_number += 1

In [33]:
create_no_missing_images_json_and_caption_post_info_file(post_with_existing_images_and_json_info_path, missing_caption_posts_path, post_with_existing_images_json_and_caption_info_path)

##### Total posts **(after removing posts with missing images, json, and caption)**

In [4]:
def count_post_with_existing_images_json_and_caption(file_path=post_with_existing_images_json_and_caption_info_path):
    return count_posts(file_path)

In [5]:
total_posts_with_no_missing_data = count_post_with_existing_images_json_and_caption(post_with_existing_images_json_and_caption_info_path)
print(f"Total posts with no missing data: {total_posts_with_no_missing_data}")

Total posts with no missing data: 1592078


##### Create and total posts with non English caption

In [36]:
def create_non_english_caption_posts_file(post_with_existing_images_json_and_caption_info_path, json_dir, missing_english_caption_posts_path):
    non_english_posts = set()
    last_processed_line = None

    # Read existing non-English posts into a set for quick lookup
    if os.path.exists(missing_english_caption_posts_path):
        with open(missing_english_caption_posts_path, "r", encoding="utf-8") as f_missing:
            for line in f_missing:
                line = line.strip()
                non_english_posts.add(line)
                last_processed_line = line  # Update the last processed line

    with open(post_with_existing_images_json_and_caption_info_path, "r", encoding="utf-8") as f_in, \
            open(missing_english_caption_posts_path, "a", encoding="utf-8") as f_out:

        if last_processed_line:
            # Skip lines until the last processed line is found
            for line in f_in:
                line = line.strip()
                if line == last_processed_line:
                    print(f"Resuming from: {line}")
                    break
            else:
                print("Last processed line not found in input file. Processing from the beginning.")
                f_in.seek(0)  # Reset file pointer to the beginning

        for line in f_in:
            line = line.strip()
            if not line:
                continue

            # Skip if the post is already in the non-English posts file
            if line in non_english_posts:
                continue

            parts = line.split('\t')
            if len(parts) < 4:
                continue

            json_file_name = parts[3]
            json_file_path = os.path.join(json_dir, json_file_name)

            try:
                with open(json_file_path, 'r', encoding='utf-8') as f_json:
                    json_data = json.load(f_json)
                    if ('edge_media_to_caption' in json_data and
                            'edges' in json_data['edge_media_to_caption'] and
                            len(json_data['edge_media_to_caption']['edges']) > 0):
                        caption_text = json_data['edge_media_to_caption']['edges'][0]['node']['text']
                        try:
                            if caption_text.strip() and detect(caption_text) != 'en':
                                non_english_posts.add(line)
                                f_out.write(line + '\n')
                                f_out.flush()
                                print(f"Non-English caption found in: {json_file_path}")
                        except LangDetectException:
                            print(f"Language detection failed for: {json_file_path}")
                            continue
            except Exception as e:
                print(f"Error processing {json_file_path}: {str(e)}")
                continue

In [37]:
create_non_english_caption_posts_file(
    post_with_existing_images_json_and_caption_info_path,
    json_dir,
    missing_english_caption_posts_path
)

Resuming from: 1592077	weartoeatiff	0	1986260829332127963.json	['1986260829332127963.jpg']


In [38]:
def count_total_non_english_posts(file_path=missing_english_caption_posts_path):
    return count_posts(file_path)

In [39]:
total_non_english_posts = count_total_non_english_posts()
print(f"Total posts with non-English captions: {total_non_english_posts}")

Total posts with non-English captions: 238369


##### Create new **post info txt** file with no missing image(s), json, caption, and non-English caption

In [40]:
def create_no_missing_data_and_no_non_english_caption_post_info_file(post_with_existing_images_json_and_caption_info_path, missing_english_caption_posts_path, post_with_existing_images_json_caption_and_english_info_path):
    with open(missing_english_caption_posts_path, "r", encoding="utf-8") as f_missing:
        non_english_posts = set(line.strip() for line in f_missing)

    with open(post_with_existing_images_json_and_caption_info_path, "r", encoding="utf-8") as f_in, \
            open(post_with_existing_images_json_caption_and_english_info_path, "w", encoding="utf-8") as f_out:

        row_number = 0
        for line in f_in:
            if line.strip() and line.strip() not in non_english_posts:
                f_out.write(str(row_number) + line[line.find('\t'):])
                row_number += 1

In [41]:
create_no_missing_data_and_no_non_english_caption_post_info_file(
    post_with_existing_images_json_and_caption_info_path,
    missing_english_caption_posts_path,
    post_with_existing_images_json_caption_and_english_info_path
)

##### Total posts **(after removing posts with missing image(s), json, caption, and non-English caption)**

In [6]:
def count_post_with_no_missing_data_and_no_non_english_caption(file_path=post_with_existing_images_json_caption_and_english_info_path):
    return count_posts(file_path)

In [7]:
total_posts_with_no_missing_data_and_no_non_english_caption = count_post_with_no_missing_data_and_no_non_english_caption(
    post_with_existing_images_json_caption_and_english_info_path
)
print(f"Total posts with no missing data and non-English caption: {total_posts_with_no_missing_data_and_no_non_english_caption}")

Total posts with no missing data and non-English caption: 1353709


##### Create and total posts with no contain giveaway caption

In [44]:
def create_no_contain_giveaway_caption_posts_file(post_with_existing_images_json_caption_and_english_info_path, json_dir, contain_giveaway_caption_posts_path):
    contain_giveaway_posts = set()
    last_processed_line = None

    # Read existing contain giveaway posts into a set for quick lookup
    if os.path.exists(contain_giveaway_caption_posts_path):
        with open(contain_giveaway_caption_posts_path, "r", encoding="utf-8") as f_missing:
            for line in f_missing:
                line = line.strip()
                contain_giveaway_posts.add(line)
                last_processed_line = line

    with open(post_with_existing_images_json_caption_and_english_info_path, "r", encoding="utf-8") as f_in, \
            open(contain_giveaway_caption_posts_path, "a", encoding="utf-8") as f_out:

        if last_processed_line:
            # Skip lines until the last processed line is found
            for line in f_in:
                line = line.strip()
                if line == last_processed_line:
                    print(f"Resuming from: {line}")
                    break
            else:
                print("Last processed line not found in input file. Processing from the beginning.")
                f_in.seek(0)

        for line in f_in:
            line = line.strip()
            if not line:
                continue    

            # Skip if the post is already in the contain giveaway posts file
            if line in contain_giveaway_posts:
                continue

            parts = line.split('\t')
            if len(parts) < 4:
                continue

            json_file_name = parts[3]
            json_file_path = os.path.join(json_dir, json_file_name)

            try:
                with open(json_file_path, 'r', encoding='utf-8') as f_json:
                    json_data = json.load(f_json)
                    if 'edge_media_to_caption' in json_data and 'edges' in json_data['edge_media_to_caption'] and len(json_data['edge_media_to_caption']['edges']) > 0:
                        caption_text = json_data['edge_media_to_caption']['edges'][0]['node']['text']
                        if 'giveaway' in caption_text.lower():
                            contain_giveaway_posts.add(line)
                            f_out.write(line + '\n')
                            f_out.flush()
                            print(f"Contain giveaway caption found in: {json_file_path}")
            except Exception as e:
                print(f"Error processing {json_file_path}: {str(e)}")
                continue

In [11]:
create_no_contain_giveaway_caption_posts_file(post_with_existing_images_json_caption_and_english_info_path, json_dir, contain_giveaway_caption_posts_path)

Contain giveaway caption found in: ../../dataset/json\1734154912567435640.json
Contain giveaway caption found in: ../../dataset/json\1738780832048556961.json
Contain giveaway caption found in: ../../dataset/json\1739263002879033158.json
Contain giveaway caption found in: ../../dataset/json\1739302443454729251.json
Contain giveaway caption found in: ../../dataset/json\1740061834481771905.json
Contain giveaway caption found in: ../../dataset/json\1743108500814675733.json
Contain giveaway caption found in: ../../dataset/json\1743608020736665614.json
Contain giveaway caption found in: ../../dataset/json\1744314854510282900.json
Contain giveaway caption found in: ../../dataset/json\1744529133406662630.json
Contain giveaway caption found in: ../../dataset/json\1744579686161530876.json
Contain giveaway caption found in: ../../dataset/json\1744646158472826320.json
Contain giveaway caption found in: ../../dataset/json\1746022435192671143.json
Contain giveaway caption found in: ../../dataset/jso

In [49]:
def count_total_posts_with_no_contain_giveaway_caption(file_path=contain_giveaway_caption_posts_path):
    return count_posts(file_path)

In [50]:
total_posts_with_no_contain_giveaway_caption = count_total_posts_with_no_contain_giveaway_caption(
    contain_giveaway_caption_posts_path
)
print(f"Total posts with no contain giveaway caption: {total_posts_with_no_contain_giveaway_caption}")

Total posts with no contain giveaway caption: 28180


##### Create new **post info txt** file with no missing image(s), json, caption, non-English caption, and contain giveaway caption

In [51]:
def create_no_missing_data_and_no_non_english_caption_and_no_contain_giveaway_caption_post_info_file(post_with_existing_images_json_caption_and_english_info_path, contain_giveaway_caption_posts_path, post_clean_info_path):
    with open(contain_giveaway_caption_posts_path, "r", encoding="utf-8") as f_missing:
        contain_giveaway_posts = set(line.strip() for line in f_missing)

    with open(post_with_existing_images_json_caption_and_english_info_path, "r", encoding="utf-8") as f_in, \
            open(post_clean_info_path, "w", encoding="utf-8") as f_out:

        row_number = 0
        for line in f_in:
            if line.strip() and line.strip() not in contain_giveaway_posts:
                f_out.write(str(row_number) + line[line.find('\t'):])
                row_number += 1

In [52]:
create_no_missing_data_and_no_non_english_caption_and_no_contain_giveaway_caption_post_info_file(post_with_existing_images_json_caption_and_english_info_path, contain_giveaway_caption_posts_path, post_clean_info_path)

##### Total posts **(after removing posts with missing image(s), json, caption, non-English caption, and contain giveaway caption)**

In [8]:
def count_post_with_no_missing_data_and_no_non_english_caption_and_no_contain_giveaway_caption(file_path=post_clean_info_path):
    return count_posts(file_path)

In [9]:
total_posts_with_no_missing_data_and_no_non_english_caption_and_no_contain_giveaway_caption = count_post_with_no_missing_data_and_no_non_english_caption_and_no_contain_giveaway_caption(
    post_clean_info_path
)
print(f"Total posts with no missing data, non-English caption, and not contain giveaway caption: {total_posts_with_no_missing_data_and_no_non_english_caption_and_no_contain_giveaway_caption}")

Total posts with no missing data, non-English caption, and not contain giveaway caption: 1325529


##### Create influencer list file

In [57]:
def create_influencer_list(clean_data_path, profiles_dir, output_csv_path, error_list_path):
    # Create/open CSV file with header
    with open(output_csv_path, 'w', encoding='utf-8') as csv_file:
        csv_file.write('influencer_name,followers,followees,posts,category\n')
    
    # Create/open error list file
    with open(error_list_path, 'w', encoding='utf-8') as error_file:
        pass
    
    # Track processed influencers to avoid duplicates
    processed_influencers = set()
    
    # Process clean data file line by line
    with open(clean_data_path, 'r', encoding='utf-8') as f:
        for index, line in enumerate(f):
            if not line.strip():
                continue
                
            # Extract influencer name (second element after splitting)
            parts = line.strip().split('\t')
            if len(parts) < 2:
                continue
                
            influencer_name = parts[1]
            
            # Skip if already processed
            if influencer_name in processed_influencers:
                continue
                
            processed_influencers.add(influencer_name)
            
            # Try to read influencer profile
            profile_path = os.path.join(profiles_dir, influencer_name)
            try:
                with open(profile_path, 'r', encoding='utf-8') as profile_file:
                    profile_data = profile_file.readline().strip().split('\t')
                    
                    # Extract required fields (followers, followees, posts, category)
                    followers = profile_data[1] if len(profile_data) > 1 else 'NULL'
                    followees = profile_data[2] if len(profile_data) > 2 else 'NULL'
                    posts = profile_data[3] if len(profile_data) > 3 else 'NULL'
                    category = profile_data[6] if len(profile_data) > 6 else 'NULL'
                    
                    # Write to CSV
                    with open(output_csv_path, 'a', encoding='utf-8') as csv_file:
                        csv_file.write(f'{influencer_name},{followers},{followees},{posts},{category}\n')
                        csv_file.flush()
                        
            except (FileNotFoundError, IndexError, Exception) as e:
                # Log error and continue
                with open(error_list_path, 'a', encoding='utf-8') as error_file:
                    error_file.write(f'{index}\t{influencer_name}\n')
                    error_file.flush()
                
                # Still write to CSV with NULL values
                with open(output_csv_path, 'a', encoding='utf-8') as csv_file:
                    csv_file.write(f'{influencer_name},NULL,NULL,NULL,NULL\n')
                    csv_file.flush()

In [58]:
create_influencer_list(
    post_clean_info_path,
    profiles_influencers_dir,
    influencers_list_clean_path,
    influencers_error_list_path
)

## Data Exploration

##### Exploration

In [13]:
def apply_chart_styling(ax, title, xlabel, ylabel, rotation=0, y_margin_top=0.15):
    """
    Apply consistent styling to charts with larger fonts and proper spacing.
    
    Parameters:
    ax: matplotlib axis object
    title (str): Chart title
    xlabel (str): X-axis label
    ylabel (str): Y-axis label
    rotation (int): X-axis tick rotation angle, default 45
    y_margin_top (float): Additional margin at top as fraction of max y value
    """
    plt.title(title, fontweight='bold', pad=20, fontsize=16)
    plt.xlabel(xlabel, fontsize=16, labelpad=15)
    plt.ylabel(ylabel, fontsize=16, labelpad=15)
    plt.xticks(rotation=rotation, fontsize=16)
    plt.yticks(fontsize=16)
    
    # Add spacing for axis labels
    ax.tick_params(axis='y', pad=10)
    ax.tick_params(axis='x', pad=10)
    
    # Adjust y-axis limits to provide space for bar labels
    y_min, y_max = ax.get_ylim()
    ax.set_ylim(y_min, y_max * (1 + y_margin_top))

def add_bar_labels(ax, values, spacing_factor=0.05, fontsize=14):
    """
    Add count labels on top of bars with consistent formatting.
    
    Parameters:
    ax: matplotlib axis object
    values: array of values to display
    spacing_factor (float): Space above bars as fraction of max value
    fontsize (int): Font size for labels
    """
    for i, v in enumerate(values):
        # Format large numbers with points instead of commas if needed
        label = f'{v:,.0f}'.replace(',', '.') if v >= 1000 else str(v)
        ax.text(i, v + max(values) * spacing_factor, label,
                ha='center', va='bottom', fontweight='bold', fontsize=fontsize)

def explore_influencer_data(exploration_data_dir, influencers_list_path, post_data_path):
    """
    Explore influencer data and generate statistics and visualizations.
    
    Parameters:
    exploration_data_dir (str): Directory to save exploration results
    influencers_list_path (str): Path to the influencers CSV file
    post_data_path (str): Path to the post data file
    """
    # Create exploration_data_dir if it doesn't exist
    if not os.path.exists(exploration_data_dir):
        os.makedirs(exploration_data_dir)
        
    # Read the influencers CSV
    df = pd.read_csv(influencers_list_path)
    
    # Convert numeric columns
    numeric_columns = ['followers', 'followees', 'posts']
    for col in numeric_columns:
        df[col] = pd.to_numeric(df[col].replace('NULL', pd.NA), errors='coerce')
    
    # Basic statistics
    stats = df[numeric_columns].describe()
    stats.to_csv(os.path.join(exploration_data_dir, 'influencer_statistics.csv'))
    
    # Create more granular follower ranges
    follower_ranges = [0, 10000, 50000, 100000, 150000, 200000, 250000, 300000]
    range_labels = ['<10K', '10K-50K', '50K-100K', '100K-150K', '150K-200K', 
                   '200K-250K', '250K-300K']
    
    df['follower_range'] = pd.cut(df['followers'], 
                                 bins=follower_ranges, 
                                 labels=range_labels,
                                 ordered=True)
    
    # Get value counts in the specific order
    follower_dist = df['follower_range'].value_counts().reindex(range_labels)
    
    # Plot follower range distribution
    plt.figure(figsize=(15, 6))
    ax = follower_dist.plot(kind='bar')
    apply_chart_styling(ax, 
                       'Distribusi Jumlah Influencer Berdasarkan Jangkauan Jumlah Follower',
                       'Jangkauan Jumlah Follower', 
                       'Jumlah Influencer')
    add_bar_labels(ax, follower_dist.values)
    
    plt.tight_layout()
    plt.savefig(os.path.join(exploration_data_dir, 'follower_range_distribution.png'))
    plt.close()
    
    # Category distribution
    plt.figure(figsize=(12, 6))
    category_counts = df['category'].value_counts()
    ax = category_counts.plot(kind='bar')
    plt.title('Distribution of Influencer Categories')
    plt.xlabel('Category')
    plt.ylabel('Number of Influencers')
    plt.xticks(rotation=45)
    
    # Add count labels on top of each bar
    for i, v in enumerate(category_counts.values):
        ax.text(i, v + max(category_counts.values) * 0.01, str(v), 
                ha='center', va='bottom', fontweight='bold')
    
    plt.tight_layout()
    plt.savefig(os.path.join(exploration_data_dir, 'category_distribution.png'))
    plt.close()
    
    # Correlation matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(df[numeric_columns].corr(), annot=True, cmap='coolwarm')
    plt.title('Correlation Matrix of Numeric Variables')
    plt.tight_layout()
    plt.savefig(os.path.join(exploration_data_dir, 'correlation_matrix.png'))
    plt.close()
    
    # Create dictionary to store post counts per influencer
    influencer_post_counts = {}
    
    # Read and count posts from the clean data file
    with open(post_data_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) >= 2:
                influencer_name = parts[1]
                if influencer_name in influencer_post_counts:
                    influencer_post_counts[influencer_name] += 1
                else:
                    influencer_post_counts[influencer_name] = 1
    
    # Add post counts to dataframe
    df['actual_posts'] = df['influencer_name'].map(influencer_post_counts)
    df['actual_posts'] = df['actual_posts'].fillna(0).astype(int)
    
    # Calculate total posts for each follower range using actual posts
    follower_range_posts = df.groupby('follower_range')['actual_posts'].sum().reindex(range_labels)
    
    # Plot follower range posts distribution
    plt.figure(figsize=(15, 6))
    ax = follower_range_posts.plot(kind='bar')
    apply_chart_styling(ax,
                       'Distribusi Jumlah Post Berdasarkan Jangkauan Follower',
                       'Jangkauan Follower',
                       'Jumlah Post')
    add_bar_labels(ax, follower_range_posts.values)
    
    plt.tight_layout()
    plt.savefig(os.path.join(exploration_data_dir, 'follower_range_posts_distribution.png'))
    plt.close()
    
    # Save summary to text file
    with open(os.path.join(exploration_data_dir, 'summary.txt'), 'w') as f:
        f.write("Influencer Data Summary\n")
        f.write("======================\n\n")
        f.write(f"Total number of influencers: {len(df)}\n")
        f.write(f"Number of unique categories: {df['category'].nunique()}\n\n")
        
        f.write("Detailed Statistics\n")
        f.write("-----------------\n\n")
        
        # Detailed statistics for followers
        f.write("Followers Statistics:\n")
        f.write(f"Mean: {df['followers'].mean():,.2f}\n")
        f.write(f"Std: {df['followers'].std():,.2f}\n")
        f.write(f"Min: {df['followers'].min():,.0f}\n")
        f.write(f"25%: {df['followers'].quantile(0.25):,.0f}\n")
        f.write(f"50% (Median): {df['followers'].quantile(0.5):,.0f}\n")
        f.write(f"75%: {df['followers'].quantile(0.75):,.0f}\n")
        f.write(f"Max: {df['followers'].max():,.0f}\n\n")
        
        # Detailed statistics for followees
        f.write("Followees Statistics:\n")
        f.write(f"Mean: {df['followees'].mean():,.2f}\n")
        f.write(f"Std: {df['followees'].std():,.2f}\n")
        f.write(f"Min: {df['followees'].min():,.0f}\n")
        f.write(f"25%: {df['followees'].quantile(0.25):,.0f}\n")
        f.write(f"50% (Median): {df['followees'].quantile(0.5):,.0f}\n")
        f.write(f"75%: {df['followees'].quantile(0.75):,.0f}\n")
        f.write(f"Max: {df['followees'].max():,.0f}\n\n")
        
        # Detailed statistics for posts
        f.write("Posts Statistics:\n")
        f.write(f"Mean: {df['posts'].mean():,.2f}\n")
        f.write(f"Std: {df['posts'].std():,.2f}\n")
        f.write(f"Min: {df['posts'].min():,.0f}\n")
        f.write(f"25%: {df['posts'].quantile(0.25):,.0f}\n")
        f.write(f"50% (Median): {df['posts'].quantile(0.5):,.0f}\n")
        f.write(f"75%: {df['posts'].quantile(0.75):,.0f}\n")
        f.write(f"Max: {df['posts'].max():,.0f}\n\n")
        
        f.write("\nFollower Range Distribution:\n")
        f.write(follower_dist.to_string())
        
        f.write("\n\nFollower Range Posts Distribution:\n")
        f.write("Total posts for each follower range:\n")
        f.write(follower_range_posts.to_string())
        
        f.write("\n\nCategory Distribution:\n")
        f.write(df['category'].value_counts().to_string())
        
        f.write("\nPost Distribution by Follower Range:\n")
        f.write("---------------------------------\n")
        for range_label in range_labels:
            total_posts = follower_range_posts[range_label]
            influencers = len(df[df['follower_range'] == range_label])
            f.write(f"{range_label}:\n")
            f.write(f"  Total Posts: {total_posts:,.0f}\n")
            f.write(f"  Number of Influencers: {influencers}\n")
            f.write(f"  Average Posts per Influencer: {total_posts/influencers if influencers > 0 else 0:,.1f}\n\n")
        
        f.write(f"\nTotal Posts Across All Ranges: {follower_range_posts.sum():,.0f}\n")
        f.write(f"Total Influencers: {len(df):,.0f}\n")
        f.write(f"Overall Average Posts per Influencer: {follower_range_posts.sum()/len(df):,.1f}\n")

In [14]:
explore_influencer_data(
    exploration_data_dir=exploration_data_clean_banget_dir,
    influencers_list_path=influencers_list_clean_path, 
    post_data_path=post_clean_info_path
)

  follower_range_posts = df.groupby('follower_range')['actual_posts'].sum().reindex(range_labels)


## Data Preprocessing After Exploration

##### Create new **post info csv** file with micro influencer only

In [9]:
def create_micro_influencer_post_info_file(influencers_list_path, post_with_existing_images_json_caption_and_english_info_path, post_with_micro_influencer_info_path):
    
    # Define micro influencer follower bounds
    micro_lower = 10000
    micro_upper = 100000

    # Read the influencer list and convert followers to numeric
    influencers_df = pd.read_csv(influencers_list_path)
    influencers_df['followers'] = pd.to_numeric(influencers_df['followers'].replace('NULL', pd.NA), errors='coerce')

    # Create a set of micro influencer names
    micro_influencers = set(
        influencers_df[(influencers_df['followers'] >= micro_lower) & (influencers_df['followers'] < micro_upper)]['influencer_name']
    )

    # Open the source post info text file and the destination CSV file.
    with open(post_with_existing_images_json_caption_and_english_info_path, 'r', encoding='utf-8') as fin, \
         open(post_with_micro_influencer_info_path, 'w', encoding='utf-8') as fout:
        
        row_number = 0
        # Optionally, you can add a header if needed. For now we're writing rows as filtered.
        for line in fin:
            line = line.strip()
            if not line:
                continue

            # Expecting each line to be in the format: row_number<TAB>influencer_name<TAB>...
            parts = line.split('\t')
            if len(parts) < 2:
                continue

            influencer_name = parts[1]
            if influencer_name in micro_influencers:
                # Write the filtered post info with a new row number.
                fout.write(str(row_number) + line[line.find('\t'):] + "\n")
                row_number += 1

In [12]:
create_micro_influencer_post_info_file(
    influencers_list_clean_path,
    post_clean_info_path,
    post_with_micro_influencer_clean_info_path
)

##### Sampling

In [30]:
def create_stratified_sample(input_path, output_path, target_size):
    # Read influencer data
    influencers_df = pd.read_csv(influencers_list_path)
    influencers_df['followers'] = pd.to_numeric(influencers_df['followers'].replace('NULL', pd.NA), errors='coerce')
    
    # Create follower range groups
    micro_influencers = {}
    micro_influencers['10K-50K'] = set(
        influencers_df[
            (influencers_df['followers'] >= 10000) & 
            (influencers_df['followers'] < 50000)
        ]['influencer_name']
    )
    micro_influencers['50K-100K'] = set(
        influencers_df[
            (influencers_df['followers'] >= 50000) & 
            (influencers_df['followers'] < 100000)
        ]['influencer_name']
    )
    
    # Calculate target sizes for each stratum
    stratum_sizes = {
        '10K-50K': int(target_size * 0.766),  # 76.6% of posts
        '50K-100K': int(target_size * 0.234)   # 23.4% of posts
    }
    
    # Store posts by stratum
    posts_by_stratum = {'10K-50K': [], '50K-100K': []}
    
    # First pass: categorize posts by stratum
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) < 2:
                continue
                
            influencer_name = parts[1]
            if influencer_name in micro_influencers['10K-50K']:
                posts_by_stratum['10K-50K'].append(line)
            elif influencer_name in micro_influencers['50K-100K']:
                posts_by_stratum['50K-100K'].append(line)
    
    # Random sampling within each stratum
    sampled_posts = []
    for stratum, target in stratum_sizes.items():
        if posts_by_stratum[stratum]:
            # Ensure we don't try to sample more posts than available
            actual_sample_size = min(target, len(posts_by_stratum[stratum]))
            stratum_sample = random.sample(posts_by_stratum[stratum], actual_sample_size)
            sampled_posts.extend(stratum_sample)
    
    # Shuffle the final sample to mix posts from different strata
    random.shuffle(sampled_posts)
    
    # Write sampled posts to output file
    with open(output_path, 'w', encoding='utf-8') as f:
        for i, post in enumerate(sampled_posts):
            # Find the tab position and create new line with updated row number
            tab_pos = post.find('\t')
            new_line = str(i) + post[tab_pos:]
            f.write(new_line)

In [15]:
create_stratified_sample(
    post_with_micro_influencer_clean_info_path,
    post_34000_sampled_clean_info_path,
    target_size=34000
)

## Data Exploration After Sampling

##### Create influencer list file

In [32]:
def create_influencer_after_sampling_list(clean_data_path, profiles_dir, output_csv_path, error_list_path):
    # Create/open CSV file with header
    with open(output_csv_path, 'w', encoding='utf-8') as csv_file:
        csv_file.write('influencer_name,followers,followees,posts,category\n')
    
    # Create/open error list file
    with open(error_list_path, 'w', encoding='utf-8') as error_file:
        pass
    
    # Track processed influencers to avoid duplicates
    processed_influencers = set()
    
    # Process clean data file line by line
    with open(clean_data_path, 'r', encoding='utf-8') as f:
        for index, line in enumerate(f):
            if not line.strip():
                continue
                
            # Extract influencer name (second element after splitting)
            parts = line.strip().split('\t')
            if len(parts) < 2:
                continue
                
            influencer_name = parts[1]
            
            # Skip if already processed
            if influencer_name in processed_influencers:
                continue
                
            processed_influencers.add(influencer_name)
            
            # Try to read influencer profile
            profile_path = os.path.join(profiles_dir, influencer_name)
            try:
                with open(profile_path, 'r', encoding='utf-8') as profile_file:
                    profile_data = profile_file.readline().strip().split('\t')
                    
                    # Extract required fields (followers, followees, posts, category)
                    followers = profile_data[1] if len(profile_data) > 1 else 'NULL'
                    followees = profile_data[2] if len(profile_data) > 2 else 'NULL'
                    posts = profile_data[3] if len(profile_data) > 3 else 'NULL'
                    category = profile_data[6] if len(profile_data) > 6 else 'NULL'
                    
                    # Write to CSV
                    with open(output_csv_path, 'a', encoding='utf-8') as csv_file:
                        csv_file.write(f'{influencer_name},{followers},{followees},{posts},{category}\n')
                        csv_file.flush()
                        
            except (FileNotFoundError, IndexError, Exception) as e:
                # Log error and continue
                with open(error_list_path, 'a', encoding='utf-8') as error_file:
                    error_file.write(f'{index}\t{influencer_name}\n')
                    error_file.flush()
                
                # Still write to CSV with NULL values
                with open(output_csv_path, 'a', encoding='utf-8') as csv_file:
                    csv_file.write(f'{influencer_name},NULL,NULL,NULL,NULL\n')
                    csv_file.flush()

In [33]:
create_influencer_after_sampling_list(
    post_34000_sampled_clean_info_path,
    profiles_influencers_dir,
    influencers_34000_after_sampling_list_clean_path,
    influencers_error_list_path
)

##### Exploration

In [4]:
def apply_chart_styling(ax, title, xlabel, ylabel, rotation=0, y_margin_top=0.15):
    """
    Apply consistent styling to charts with larger fonts and proper spacing.
    
    Parameters:
    ax: matplotlib axis object
    title (str): Chart title
    xlabel (str): X-axis label
    ylabel (str): Y-axis label
    rotation (int): X-axis tick rotation angle, default 45
    y_margin_top (float): Additional margin at top as fraction of max y value
    """
    plt.title(title, fontweight='bold', pad=20, fontsize=16)
    plt.xlabel(xlabel, fontsize=16, labelpad=15)
    plt.ylabel(ylabel, fontsize=16, labelpad=15)
    plt.xticks(rotation=rotation, fontsize=16)
    plt.yticks(fontsize=16)
    
    # Add spacing for axis labels
    ax.tick_params(axis='y', pad=10)
    ax.tick_params(axis='x', pad=10)
    
    # Adjust y-axis limits to provide space for bar labels
    y_min, y_max = ax.get_ylim()
    ax.set_ylim(y_min, y_max * (1 + y_margin_top))

def add_bar_labels(ax, values, spacing_factor=0.05, fontsize=14):
    """
    Add count labels on top of bars with consistent formatting.
    
    Parameters:
    ax: matplotlib axis object
    values: array of values to display
    spacing_factor (float): Space above bars as fraction of max value
    fontsize (int): Font size for labels
    """
    for i, v in enumerate(values):
        # Format large numbers with points instead of commas if needed
        label = f'{v:,.0f}'.replace(',', '.') if v >= 1000 else str(v)
        ax.text(i, v + max(values) * spacing_factor, label,
                ha='center', va='bottom', fontweight='bold', fontsize=fontsize)

def explore_influencer_data(exploration_data_dir, influencers_list_path, post_data_path):
    """
    Explore influencer data and generate statistics and visualizations.
    
    Parameters:
    exploration_data_dir (str): Directory to save exploration results
    influencers_list_path (str): Path to the influencers CSV file
    post_data_path (str): Path to the post data file
    """
    # Create exploration_data_dir if it doesn't exist
    if not os.path.exists(exploration_data_dir):
        os.makedirs(exploration_data_dir)
        
    # Read the influencers CSV
    df = pd.read_csv(influencers_list_path)
    
    # Convert numeric columns
    numeric_columns = ['followers', 'followees', 'posts']
    for col in numeric_columns:
        df[col] = pd.to_numeric(df[col].replace('NULL', pd.NA), errors='coerce')
    
    # Basic statistics
    stats = df[numeric_columns].describe()
    stats.to_csv(os.path.join(exploration_data_dir, 'influencer_statistics.csv'))
    
    # Create more granular follower ranges
    follower_ranges = [0, 10000, 50000, 100000, 150000, 200000, 250000, 300000]
    range_labels = ['<10K', '10K-50K', '50K-100K', '100K-150K', '150K-200K', 
                   '200K-250K', '250K-300K']
    
    df['follower_range'] = pd.cut(df['followers'], 
                                 bins=follower_ranges, 
                                 labels=range_labels,
                                 ordered=True)
    
    # Get value counts in the specific order
    follower_dist = df['follower_range'].value_counts().reindex(range_labels)
    
    # Plot follower range distribution
    plt.figure(figsize=(15, 6))
    ax = follower_dist.plot(kind='bar')
    apply_chart_styling(ax, 
                       'Distribusi Jumlah Influencer Berdasarkan Jangkauan Jumlah Follower',
                       'Jangkauan Jumlah Follower', 
                       'Jumlah Influencer')
    add_bar_labels(ax, follower_dist.values)
    
    plt.tight_layout()
    plt.savefig(os.path.join(exploration_data_dir, 'follower_range_distribution.png'))
    plt.close()
    
    # Category distribution
    plt.figure(figsize=(12, 6))
    category_counts = df['category'].value_counts()
    ax = category_counts.plot(kind='bar')
    plt.title('Distribution of Influencer Categories')
    plt.xlabel('Category')
    plt.ylabel('Number of Influencers')
    plt.xticks(rotation=45)
    
    # Add count labels on top of each bar
    for i, v in enumerate(category_counts.values):
        ax.text(i, v + max(category_counts.values) * 0.01, str(v), 
                ha='center', va='bottom', fontweight='bold')
    
    plt.tight_layout()
    plt.savefig(os.path.join(exploration_data_dir, 'category_distribution.png'))
    plt.close()
    
    # Correlation matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(df[numeric_columns].corr(), annot=True, cmap='coolwarm')
    plt.title('Correlation Matrix of Numeric Variables')
    plt.tight_layout()
    plt.savefig(os.path.join(exploration_data_dir, 'correlation_matrix.png'))
    plt.close()
    
    # Create dictionary to store post counts per influencer
    influencer_post_counts = {}
    
    # Read and count posts from the clean data file
    with open(post_data_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) >= 2:
                influencer_name = parts[1]
                if influencer_name in influencer_post_counts:
                    influencer_post_counts[influencer_name] += 1
                else:
                    influencer_post_counts[influencer_name] = 1
    
    # Add post counts to dataframe
    df['actual_posts'] = df['influencer_name'].map(influencer_post_counts)
    df['actual_posts'] = df['actual_posts'].fillna(0).astype(int)
    
    # Calculate total posts for each follower range using actual posts
    follower_range_posts = df.groupby('follower_range')['actual_posts'].sum().reindex(range_labels)
    
    # Plot follower range posts distribution
    plt.figure(figsize=(15, 6))
    ax = follower_range_posts.plot(kind='bar')
    apply_chart_styling(ax,
                       'Distribusi Jumlah Post Berdasarkan Jangkauan Follower',
                       'Jangkauan Follower',
                       'Jumlah Post')
    add_bar_labels(ax, follower_range_posts.values)
    
    plt.tight_layout()
    plt.savefig(os.path.join(exploration_data_dir, 'follower_range_posts_distribution.png'))
    plt.close()
    
    # Save summary to text file
    with open(os.path.join(exploration_data_dir, 'summary.txt'), 'w') as f:
        f.write("Influencer Data Summary\n")
        f.write("======================\n\n")
        f.write(f"Total number of influencers: {len(df)}\n")
        f.write(f"Number of unique categories: {df['category'].nunique()}\n\n")
        
        f.write("Detailed Statistics\n")
        f.write("-----------------\n\n")
        
        # Detailed statistics for followers
        f.write("Followers Statistics:\n")
        f.write(f"Mean: {df['followers'].mean():,.2f}\n")
        f.write(f"Std: {df['followers'].std():,.2f}\n")
        f.write(f"Min: {df['followers'].min():,.0f}\n")
        f.write(f"25%: {df['followers'].quantile(0.25):,.0f}\n")
        f.write(f"50% (Median): {df['followers'].quantile(0.5):,.0f}\n")
        f.write(f"75%: {df['followers'].quantile(0.75):,.0f}\n")
        f.write(f"Max: {df['followers'].max():,.0f}\n\n")
        
        # Detailed statistics for followees
        f.write("Followees Statistics:\n")
        f.write(f"Mean: {df['followees'].mean():,.2f}\n")
        f.write(f"Std: {df['followees'].std():,.2f}\n")
        f.write(f"Min: {df['followees'].min():,.0f}\n")
        f.write(f"25%: {df['followees'].quantile(0.25):,.0f}\n")
        f.write(f"50% (Median): {df['followees'].quantile(0.5):,.0f}\n")
        f.write(f"75%: {df['followees'].quantile(0.75):,.0f}\n")
        f.write(f"Max: {df['followees'].max():,.0f}\n\n")
        
        # Detailed statistics for posts
        f.write("Posts Statistics:\n")
        f.write(f"Mean: {df['posts'].mean():,.2f}\n")
        f.write(f"Std: {df['posts'].std():,.2f}\n")
        f.write(f"Min: {df['posts'].min():,.0f}\n")
        f.write(f"25%: {df['posts'].quantile(0.25):,.0f}\n")
        f.write(f"50% (Median): {df['posts'].quantile(0.5):,.0f}\n")
        f.write(f"75%: {df['posts'].quantile(0.75):,.0f}\n")
        f.write(f"Max: {df['posts'].max():,.0f}\n\n")
        
        f.write("\nFollower Range Distribution:\n")
        f.write(follower_dist.to_string())
        
        f.write("\n\nFollower Range Posts Distribution:\n")
        f.write("Total posts for each follower range:\n")
        f.write(follower_range_posts.to_string())
        
        f.write("\n\nCategory Distribution:\n")
        f.write(df['category'].value_counts().to_string())
        
        f.write("\nPost Distribution by Follower Range:\n")
        f.write("---------------------------------\n")
        for range_label in range_labels:
            total_posts = follower_range_posts[range_label]
            influencers = len(df[df['follower_range'] == range_label])
            f.write(f"{range_label}:\n")
            f.write(f"  Total Posts: {total_posts:,.0f}\n")
            f.write(f"  Number of Influencers: {influencers}\n")
            f.write(f"  Average Posts per Influencer: {total_posts/influencers if influencers > 0 else 0:,.1f}\n\n")
        
        f.write(f"\nTotal Posts Across All Ranges: {follower_range_posts.sum():,.0f}\n")
        f.write(f"Total Influencers: {len(df):,.0f}\n")
        f.write(f"Overall Average Posts per Influencer: {follower_range_posts.sum()/len(df):,.1f}\n")

In [5]:
explore_influencer_data(
    exploration_34000_data_after_sampling_clean_dir,
    influencers_34000_after_sampling_list_clean_path,
    post_34000_sampled_clean_info_path
)

  follower_range_posts = df.groupby('follower_range')['actual_posts'].sum().reindex(range_labels)
