In [3]:
import pandas as pd
import json

In [4]:
# Read the hand-labeled article file into memory
with open('./data/labeled_articles.json', 'r') as f:
    data = json.load(f)

# Build one free-text column (title + space + body) and keep only that
# column together with the two label columns.
df = (
    pd.DataFrame(data)
    .assign(text=lambda frame: frame['title'] + ' ' + frame['body'])
    [['text', 'sector', 'subsector']]
)
df

Unnamed: 0,text,sector,subsector
0,SAPX secures revolving loan of IDR 125 billion...,transportation-logistic,logistics-deliveries
1,Waskita Karya to inject IDR 10 billion for Kap...,infrastructures,transportation-infrastructure
2,Trimegah Bangun Persada to raise buyback budge...,financials,investment-service
3,"Continuing to release Totalindo (TOPS) shares,...",infrastructures,heavy-constructions-civil-engineering
4,"Distribute financing to 900 thousand MSMEs, Ko...",technology,software-it-services
...,...,...,...
85,PBRX in PKPU status Commercial Court of Centra...,industrials,industrial-goods
86,Kedoya Adyaraya to set up final dividend of ID...,healthcare,healthcare-equipment-providers
87,Tan John Tanuwijaya now only owns 62.45% of Be...,financials,investment-service
88,Net foreign volume stuck in red zone with -1.2...,technology,software-it-services


In [5]:
# Summary statistics (count / unique / top / freq) for the text and label columns
df.describe()

Unnamed: 0,text,sector,subsector
count,90,90,90
unique,90,11,24
top,SAPX secures revolving loan of IDR 125 billion...,financials,investment-service
freq,1,16,12


In [6]:
# Load the subsector reference file
with open('./data/subsectors.json', 'r') as f:
    data = json.load(f)
subsectors_data = pd.DataFrame(data=data)

# Distinct sector names; the first occurrence of each is kept, so the
# resulting Series retains the original (non-contiguous) row index.
sectors = subsectors_data.loc[:, 'sector'].drop_duplicates(keep='first')
sectors


0             infrastructures
1                      energy
2                  financials
4          consumer-cyclicals
5                  technology
10                industrials
13     consumer-non-cyclicals
14    transportation-logistic
16                 healthcare
21            basic-materials
24     properties-real-estate
Name: sector, dtype: object

In [7]:
# Read the raw article dump; its nesting is flattened further down in this cell
with open('./data/idnarticles400.json', mode='r') as f:
    data400 = json.load(fp=f)

# Flatten nested lists
def flatten_articles(raw_data, key='title'):
    """Collect every article record from an arbitrarily nested JSON structure.

    A dict that contains ``key`` is treated as an article: it is collected
    as-is and NOT searched any further. Lists, and dicts without ``key``,
    are containers whose elements/values are searched in turn. Any other
    value (str, int, None, ...) is ignored.

    Args:
        raw_data: Decoded JSON payload (any mix of dicts, lists and scalars).
        key: Dict key whose presence marks a dict as an article record.
            Defaults to ``'title'``.

    Returns:
        list: The article dicts (the original objects, not copies) in the
        order a depth-first, left-to-right walk encounters them.
    """
    articles = []
    # An explicit stack replaces recursion so that very deeply nested input
    # cannot hit Python's recursion limit.
    pending = [raw_data]
    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            if key in node:
                articles.append(node)
            else:
                # Push children reversed so they are popped in original order.
                pending.extend(reversed(list(node.values())))
        elif isinstance(node, list):
            pending.extend(reversed(node))
    return articles

# Collapse the nested payload into a flat list of article records
articles = flatten_articles(raw_data=data400)

# One row per collected article, then a per-column summary
df400 = pd.DataFrame(data=articles)
df400.describe()

Unnamed: 0,title,body,source,timestamp
count,3920,3920,3920,3920
unique,3907,3910,3910,3907
top,"Alexander Ramlie’s estate planning revealed, A...",The stock price of PT Amman Mineral Internasio...,https://www.idnfinancials.com/news/50166/alexa...,2024-06-26 14:55:39
freq,2,2,2,2


In [8]:
# NOTE: this rebinds data400, which the previous cell used for the raw dump
with open('./data/labeled_articles400.json', 'r') as f:
    data400 = json.load(f)

# Build the frame, then drop rows whose sector label is 'unknown'
dflabel400 = pd.DataFrame(data400)
dflabel400 = dflabel400.loc[dflabel400['sector'].ne('unknown')]

# Class balance of the remaining labels
dflabel400['sector'].value_counts()

sector
financials                 2099
infrastructures             449
transportation-logistic     110
consumer-cyclicals          109
energy                       89
properties-real-estate       78
industrials                  77
consumer-non-cyclicals       75
technology                   49
basic-materials              29
healthcare                   28
Name: count, dtype: int64

In [11]:
# Predict sectors using Logistic Regression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import re
import pandas as pd
import json

# Reload the labeled articles so this cell does not depend on earlier state
with open('./data/labeled_articles.json', 'r') as f:
    data = json.load(f)

# Model input is title + space + body; only the sector label is kept
df = (
    pd.DataFrame(data)
    .assign(text=lambda frame: frame['title'] + ' ' + frame['body'])
    [['text', 'sector']]
)

# Text preprocessing function
def preprocess_text(text):
    """Normalize a raw article string before vectorization.

    Every non-word character is replaced by a space, runs of whitespace are
    collapsed to a single space, and the result is lowercased. Leading and
    trailing spaces are not stripped.

    Args:
        text: The string to normalize.

    Returns:
        str: The normalized, lowercased string.
    """
    without_symbols = re.sub(r'\W', ' ', text)
    single_spaced = re.sub(r'\s+', ' ', without_symbols)
    return single_spaced.lower()

# Normalize every article's text in place (see preprocess_text above)
df['text'] = df['text'].apply(preprocess_text)

# Encode the labels: sector names are replaced by integer codes.
# label_encoder keeps the mapping, so label_encoder.inverse_transform can
# turn predicted codes back into sector names.
label_encoder = LabelEncoder()
df['sector'] = label_encoder.fit_transform(df['sector'])

# Split the data into training and testing sets (80% / 20%, fixed seed so the
# split is repeatable).
# NOTE(review): the split is not stratified; the describe() output earlier in
# the notebook shows 90 rows over 11 sectors, so the test set is tiny and the
# reported accuracy is likely noisy — confirm before drawing conclusions.
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['sector'], test_size=0.2, random_state=42)

# Convert text data to TF-IDF features. The vocabulary (capped at 5000 terms)
# is fitted on the training texts only; the test texts are merely transformed,
# so no test information leaks into the features.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train the model
model = LogisticRegression(max_iter=1000)  # iteration cap raised above the default of 100 so the solver can converge
model.fit(X_train_tfidf, y_train)

# Predict on the test set (predictions are integer sector codes)
y_pred = model.predict(X_test_tfidf)

# Evaluate the model: fraction of test articles whose sector was predicted exactly
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

# Print the first few rows of the dataframe (normalized text, encoded sector)
print(df.head())

Accuracy: 0.3333333333333333
                                                text  sector
0  sapx secures revolving loan of idr 125 billion...      10
1  waskita karya to inject idr 10 billion for kap...       7
2  trimegah bangun persada to raise buyback budge...       4
3  continuing to release totalindo tops shares de...       7
4  distribute financing to 900 thousand msmes koi...       9
