In [22]:
import pandas as pd
import matplotlib.pyplot as plt

In [23]:
# Load the raw Threads export into a DataFrame and preview the first rows.
RAW_CSV_PATH = "../data/raw/threads_data_indonesia.csv"
df = pd.read_csv(RAW_CSV_PATH)
df.head()

Unnamed: 0,#,NAME,FOLLOWERS,ER,COUNTRY,TOPIC OF INFLUENCE,POTENTIAL REACH
0,1,Ridwan Kamil @ridwankamil,431K,1.11%,Indonesia,Politics,129.3K
1,2,"Deddy Corbuzier, Ph.D @mastercorbuzier",401.8K,0.67%,Indonesia,Entertainment & Music,120.5K
2,3,Raditya Dika @raditya_dika,367.3K,0.63%,Indonesia,Entertainment & Music Funny,110.2K
3,4,anya geraldine @anyageraldine,332.3K,0.62%,Indonesia,Product Showcase,99.7K
4,5,Luna Maya @lunamaya,313.3K,0.43%,Indonesia,Beauty & Self Care Entertainment & Music,94K


In [24]:
# Remove the running number, country and potential-reach columns; none of the
# cells shown below reads them.
unused_columns = ["#", "COUNTRY", "POTENTIAL REACH"]
df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,NAME,FOLLOWERS,ER,TOPIC OF INFLUENCE
0,Ridwan Kamil @ridwankamil,431K,1.11%,Politics
1,"Deddy Corbuzier, Ph.D @mastercorbuzier",401.8K,0.67%,Entertainment & Music
2,Raditya Dika @raditya_dika,367.3K,0.63%,Entertainment & Music Funny
3,anya geraldine @anyageraldine,332.3K,0.62%,Product Showcase
4,Luna Maya @lunamaya,313.3K,0.43%,Beauty & Self Care Entertainment & Music


In [25]:
df.isnull().sum()

NAME                  0
FOLLOWERS             0
ER                    0
TOPIC OF INFLUENCE    1
dtype: int64

In [26]:
print(df["TOPIC OF INFLUENCE"].value_counts())

TOPIC OF INFLUENCE
Entertainment & Music                                        17
Fashion & Accessories                                         8
Education                                                     7
Food                                                          6
Product Showcase                                              5
Sports                                                        4
Entertainment & Music Arts & Crafts                           4
Beauty & Self Care                                            4
Fitness & Health                                              3
Finance                                                       3
News                                                          2
Fashion & Accessories Product Showcase                        2
Travel                                                        2
Food Travel                                                   2
Entertainment & Music Media                                   1
Entertainment & Music

In [27]:
def map_topic_7(topic):
    """Collapse a raw topic label into one broad category.

    The label is converted to a lower-case string and compared against the
    keyword groups below in order. The first group that has any keyword
    occurring as a substring of the label decides the category, so a label
    that mentions several topics is assigned to the earliest matching group.
    Labels matching no group (including missing values, whose string form
    contains none of the keywords) are returned as "Others".
    """
    keyword_groups = (
        ("Entertainment & Arts", ("entertainment", "music", "funny", "drama", "art", "craft")),
        ("Fashion & Beauty", ("fashion", "beauty", "accessories", "self care")),
        ("Lifestyle & Travel", ("food", "travel", "family", "life")),
        ("Education & Society", ("education", "politics", "news", "society")),
        ("Fitness & Health", ("fitness", "health", "outdoor")),
        ("Tech & Finance", ("finance", "business", "tech", "technology")),
    )

    label = str(topic).lower()
    for category, keywords in keyword_groups:
        for keyword in keywords:
            # Plain substring test, e.g. "art" also matches "arts".
            if keyword in label:
                return category
    return "Others"

# Derive the broad category for every row. A missing topic is stringified inside
# map_topic_7, matches no keyword and therefore ends up in "Others".
df["TOPIC_CATEGORY"] = df["TOPIC OF INFLUENCE"].apply(map_topic_7)


In [28]:
print(df["TOPIC_CATEGORY"].value_counts())


TOPIC_CATEGORY
Entertainment & Arts    32
Fashion & Beauty        16
Lifestyle & Travel      16
Others                  15
Education & Society     14
Tech & Finance           4
Fitness & Health         3
Name: count, dtype: int64


In [29]:
# Show the rows whose original topic label is missing, to check which category they received.
missing_topic = df[df["TOPIC OF INFLUENCE"].isna()]
print(missing_topic)

                                  NAME FOLLOWERS     ER TOPIC OF INFLUENCE  \
40  Benjamin Master Adhisurya @iben_ma     90.2K  1.22%                NaN   

   TOPIC_CATEGORY  
40         Others  


In [9]:
df.head(10)

Unnamed: 0,NAME,FOLLOWERS,ER,TOPIC OF INFLUENCE,TOPIC_CATEGORY
0,Ridwan Kamil @ridwankamil,431K,1.11%,Politics,Education & Society
1,"Deddy Corbuzier, Ph.D @mastercorbuzier",401.8K,0.67%,Entertainment & Music,Entertainment & Arts
2,Raditya Dika @raditya_dika,367.3K,0.63%,Entertainment & Music Funny,Entertainment & Arts
3,anya geraldine @anyageraldine,332.3K,0.62%,Product Showcase,Others
4,Luna Maya @lunamaya,313.3K,0.43%,Beauty & Self Care Entertainment & Music,Entertainment & Arts
5,Gisella Anastasia @gisel_la,234.4K,-,family,Lifestyle & Travel
6,Shopee Indonesia @shopee_id,231.1K,0.11%,Product Showcase,Others
7,Tasya Farasya @tasyafarasya,221.5K,0.24%,Beauty & Self Care,Fashion & Beauty
8,Dinda Hauw @dindahw,212.1K,0.82%,Beauty & Self Care,Fashion & Beauty
9,Cipeng | TIRTA @dr.tirta,186.7K,0.11%,Fitness & Health,Fitness & Health


In [30]:
# TOPIC_CATEGORY replaces the raw label from here on, so the original column is removed.
raw_topic_column = "TOPIC OF INFLUENCE"
df = df.drop(columns=[raw_topic_column])
df.head()

Unnamed: 0,NAME,FOLLOWERS,ER,TOPIC_CATEGORY
0,Ridwan Kamil @ridwankamil,431K,1.11%,Education & Society
1,"Deddy Corbuzier, Ph.D @mastercorbuzier",401.8K,0.67%,Entertainment & Arts
2,Raditya Dika @raditya_dika,367.3K,0.63%,Entertainment & Arts
3,anya geraldine @anyageraldine,332.3K,0.62%,Others
4,Luna Maya @lunamaya,313.3K,0.43%,Entertainment & Arts


In [31]:
print(df["TOPIC_CATEGORY"].value_counts())

TOPIC_CATEGORY
Entertainment & Arts    32
Fashion & Beauty        16
Lifestyle & Travel      16
Others                  15
Education & Society     14
Tech & Finance           4
Fitness & Health         3
Name: count, dtype: int64


In [32]:
def parse_number(value):
    """Convert an abbreviated count such as "431K" or "1.2M" into a float.

    Strings are stripped, upper-cased and cleared of "," separators before
    parsing. A trailing "K" multiplies the number by 1,000 and a trailing "M"
    by 1,000,000; any other string is parsed as a plain number.

    Args:
        value: Raw cell value. Non-string values are returned unchanged.

    Returns:
        The parsed float; ``None`` when a string cannot be read as a number
        (for example "-" or "unknown"); or ``value`` itself when it is not a
        string.
    """
    if not isinstance(value, str):
        return value

    text = value.strip().upper().replace(",", "")

    # Only a suffix at the very end counts as a multiplier, so a word that
    # merely contains "M" or "K" is not mistaken for an abbreviated number.
    multiplier = 1
    for suffix, factor in (("M", 1_000_000), ("K", 1_000)):
        if text.endswith(suffix):
            text = text[:-1]
            multiplier = factor
            break

    # Every parse goes through the same guarded conversion so that all
    # unparseable strings consistently yield None instead of some raising.
    try:
        return float(text) * multiplier
    except ValueError:
        return None

# Turn the abbreviated follower strings (e.g. "431K") into plain numbers.
df['FOLLOWERS'] = df['FOLLOWERS'].apply(parse_number)

In [33]:
def parse_percentage(value):
    """Convert a percentage string such as "1.11%" into a fraction (0.0111).

    Anything that is not a string, and any string without a "%" sign, is
    handed back unchanged.
    """
    is_percent_text = isinstance(value, str) and '%' in value
    if not is_percent_text:
        return value

    number_text = value.replace('%', '')
    return float(number_text) / 100

# Convert "x.xx%" strings to fractions; entries without a "%" sign (such as "-")
# are left unchanged by parse_percentage, so the column can still hold strings.
df['ER'] = df['ER'].apply(parse_percentage)


In [34]:
df.head()

Unnamed: 0,NAME,FOLLOWERS,ER,TOPIC_CATEGORY
0,Ridwan Kamil @ridwankamil,431000.0,0.0111,Education & Society
1,"Deddy Corbuzier, Ph.D @mastercorbuzier",401800.0,0.0067,Entertainment & Arts
2,Raditya Dika @raditya_dika,367300.0,0.0063,Entertainment & Arts
3,anya geraldine @anyageraldine,332300.0,0.0062,Others
4,Luna Maya @lunamaya,313300.0,0.0043,Entertainment & Arts


In [35]:
# List the distinct ER values to spot entries that were not converted to numbers.
ER = df["ER"].unique()
print(ER)

[0.0111 0.0067 0.0063 0.0062 0.0043 '-' 0.0011 0.0024 0.008199999999999999
 0.0172 0.004699999999999999 0.0018 0.0066 0.0023 0.0028000000000000004
 0.011000000000000001 0.003 0.0141 0.0118 0.0068000000000000005 0.0033
 0.0143 0.0138 0.0004 0.0070999999999999995 0.0139 0.0042 0.0097 0.0025
 0.0037 0.002 0.0052 0.016 0.0008 0.012199999999999999 0.0176
 0.0017000000000000001 0.005600000000000001 0.0012 0.004 0.0032 0.0016
 0.0053 0.006 0.0079 0.0213 0.0006 0.0021 0.0147 0.001 0.0015 0.0026
 0.0031 0.0154 0.006999999999999999 0.0076 0.0148 0.0069
 0.0034000000000000002 0.0019 0.0009 0.0014000000000000002 0.0013 0.0044
 0.0048 0.0074 0.015700000000000002 0.006500000000000001 0.0095 0.0131
 0.0002 0.0005]


In [36]:
# Keep only rows with a real engagement rate ("-" is the placeholder left over
# from the raw data), then make the column numeric.
# The explicit .copy() turns the filtered result into an independent frame, so
# this assignment and the column assignments in later cells do not operate on
# a slice of the original frame (which triggers pandas' SettingWithCopyWarning).
df = df[df['ER'] != '-'].copy()
df['ER'] = pd.to_numeric(df['ER'])

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 96 entries, 0 to 99
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   NAME            96 non-null     object 
 1   FOLLOWERS       96 non-null     float64
 2   ER              96 non-null     float64
 3   TOPIC_CATEGORY  96 non-null     object 
dtypes: float64(2), object(2)
memory usage: 3.8+ KB


In [38]:
def classify_type(followers):
    """Assign an influencer tier from a follower count.

    Tiers, with exclusive upper bounds: Nano below 10,000; Micro below
    50,000; Mid-tier below 500,000; Macro below 1,000,000; Mega otherwise.

    Args:
        followers: Follower count as a number.

    Returns:
        The tier name, or ``None`` when the count is missing (None/NaN).
    """
    # A NaN fails every "<" comparison and would otherwise fall through to
    # "Mega"; report a missing count as missing instead.
    if pd.isna(followers):
        return None

    tier_limits = (
        (10_000, 'Nano'),
        (50_000, 'Micro'),
        (500_000, 'Mid-tier'),
        (1_000_000, 'Macro'),
    )
    for upper_bound, tier in tier_limits:
        if followers < upper_bound:
            return tier
    return 'Mega'

# Label each account with its tier, based on the numeric follower count.
df['TYPE_INFLUENCER'] = df['FOLLOWERS'].apply(classify_type)

In [39]:
df.head()

Unnamed: 0,NAME,FOLLOWERS,ER,TOPIC_CATEGORY,TYPE_INFLUENCER
0,Ridwan Kamil @ridwankamil,431000.0,0.0111,Education & Society,Mid-tier
1,"Deddy Corbuzier, Ph.D @mastercorbuzier",401800.0,0.0067,Entertainment & Arts,Mid-tier
2,Raditya Dika @raditya_dika,367300.0,0.0063,Entertainment & Arts,Mid-tier
3,anya geraldine @anyageraldine,332300.0,0.0062,Others,Mid-tier
4,Luna Maya @lunamaya,313300.0,0.0043,Entertainment & Arts,Mid-tier


In [40]:
# Flat rate per influencer tier, stored in the CPE column.
# NOTE(review): CPE presumably means "cost per engagement"; the currency/unit
# of these figures is not stated in the notebook — confirm.
rate_map = dict(
    [
        ('Nano', 500),
        ('Micro', 1000),
        ('Mid-tier', 2000),
        ('Macro', 3000),
        ('Mega', 5000),
    ]
)

df = df.assign(CPE=df['TYPE_INFLUENCER'].map(rate_map))

In [41]:
df.head()

Unnamed: 0,NAME,FOLLOWERS,ER,TOPIC_CATEGORY,TYPE_INFLUENCER,CPE
0,Ridwan Kamil @ridwankamil,431000.0,0.0111,Education & Society,Mid-tier,2000
1,"Deddy Corbuzier, Ph.D @mastercorbuzier",401800.0,0.0067,Entertainment & Arts,Mid-tier,2000
2,Raditya Dika @raditya_dika,367300.0,0.0063,Entertainment & Arts,Mid-tier,2000
3,anya geraldine @anyageraldine,332300.0,0.0062,Others,Mid-tier,2000
4,Luna Maya @lunamaya,313300.0,0.0043,Entertainment & Arts,Mid-tier,2000


In [43]:
# ENGAGEMENT is the engagement rate (a fraction) multiplied by the follower count.
df = df.assign(ENGAGEMENT=df['ER'] * df['FOLLOWERS'])

In [45]:
df.head()

Unnamed: 0,NAME,FOLLOWERS,ER,TOPIC_CATEGORY,TYPE_INFLUENCER,CPE,ENGAGEMENT
0,Ridwan Kamil @ridwankamil,431000.0,0.0111,Education & Society,Mid-tier,2000,4784.1
1,"Deddy Corbuzier, Ph.D @mastercorbuzier",401800.0,0.0067,Entertainment & Arts,Mid-tier,2000,2692.06
2,Raditya Dika @raditya_dika,367300.0,0.0063,Entertainment & Arts,Mid-tier,2000,2313.99
3,anya geraldine @anyageraldine,332300.0,0.0062,Others,Mid-tier,2000,2060.26
4,Luna Maya @lunamaya,313300.0,0.0043,Entertainment & Arts,Mid-tier,2000,1347.19


In [46]:
# ESTIMATED_COST is the ENGAGEMENT figure multiplied by the tier's CPE rate.
df = df.assign(ESTIMATED_COST=df['ENGAGEMENT'] * df['CPE'])
df.head()

Unnamed: 0,NAME,FOLLOWERS,ER,TOPIC_CATEGORY,TYPE_INFLUENCER,CPE,ENGAGEMENT,ESTIMATED_COST
0,Ridwan Kamil @ridwankamil,431000.0,0.0111,Education & Society,Mid-tier,2000,4784.1,9568200.0
1,"Deddy Corbuzier, Ph.D @mastercorbuzier",401800.0,0.0067,Entertainment & Arts,Mid-tier,2000,2692.06,5384120.0
2,Raditya Dika @raditya_dika,367300.0,0.0063,Entertainment & Arts,Mid-tier,2000,2313.99,4627980.0
3,anya geraldine @anyageraldine,332300.0,0.0062,Others,Mid-tier,2000,2060.26,4120520.0
4,Luna Maya @lunamaya,313300.0,0.0043,Entertainment & Arts,Mid-tier,2000,1347.19,2694380.0


In [48]:
# FOLLOWERS has already been folded into ENGAGEMENT, so the column is removed.
followers_column = "FOLLOWERS"
df = df.drop(columns=[followers_column])
df.head()

Unnamed: 0,NAME,ER,TOPIC_CATEGORY,TYPE_INFLUENCER,CPE,ENGAGEMENT,ESTIMATED_COST
0,Ridwan Kamil @ridwankamil,0.0111,Education & Society,Mid-tier,2000,4784.1,9568200.0
1,"Deddy Corbuzier, Ph.D @mastercorbuzier",0.0067,Entertainment & Arts,Mid-tier,2000,2692.06,5384120.0
2,Raditya Dika @raditya_dika,0.0063,Entertainment & Arts,Mid-tier,2000,2313.99,4627980.0
3,anya geraldine @anyageraldine,0.0062,Others,Mid-tier,2000,2060.26,4120520.0
4,Luna Maya @lunamaya,0.0043,Entertainment & Arts,Mid-tier,2000,1347.19,2694380.0


In [49]:
# ER has already been folded into ENGAGEMENT, so the column is removed.
rate_column = "ER"
df = df.drop(columns=[rate_column])
df.head()

Unnamed: 0,NAME,TOPIC_CATEGORY,TYPE_INFLUENCER,CPE,ENGAGEMENT,ESTIMATED_COST
0,Ridwan Kamil @ridwankamil,Education & Society,Mid-tier,2000,4784.1,9568200.0
1,"Deddy Corbuzier, Ph.D @mastercorbuzier",Entertainment & Arts,Mid-tier,2000,2692.06,5384120.0
2,Raditya Dika @raditya_dika,Entertainment & Arts,Mid-tier,2000,2313.99,4627980.0
3,anya geraldine @anyageraldine,Others,Mid-tier,2000,2060.26,4120520.0
4,Luna Maya @lunamaya,Entertainment & Arts,Mid-tier,2000,1347.19,2694380.0


In [None]:
# Remove the tier label and its rate; both are already reflected in ESTIMATED_COST.
helper_columns = ["TYPE_INFLUENCER", "CPE"]
df = df.drop(columns=helper_columns)
df.head()

In [52]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
output_path = '../data/fix/threads_cleaned.csv'  # Adjust the file name to match the platform
df.to_csv(output_path, index=False)