In [None]:
! pip install tiktoken
! pip install matplotlib
! pip install wordcloud
! pip install nltk
! pip install pandas


In [1]:
import pandas as pd 
# Load the raw dataset; the path is relative to the current working directory.
df = pd.read_csv("bbc.csv")

In [2]:
# Show column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2234 entries, 0 to 2233
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  2232 non-null   object
 1   text      2232 non-null   object
dtypes: object(2)
memory usage: 35.0+ KB


In [3]:
# Summary statistics per column (count, unique, top, freq for object columns).
df.describe()

Unnamed: 0,category,text
count,2232,2232
unique,5,2124
top,business,spain coach faces racism inquiry spain s footb...
freq,512,2


In [4]:
# Number of rows per category value (value_counts skips missing categories).
label_counts = df['category'].value_counts()
print(label_counts)

category
business         512
sport            512
politics         421
tech             400
entertainment    387
Name: count, dtype: int64


In [5]:
# Row count of the text column, missing entries included.
df["text"].shape[0]

2234

In [6]:
# Distinct text values; dropna=False counts a missing value as one entry,
# matching the length of .unique().
df["text"].nunique(dropna=False)

2125

In [7]:
# Remove rows that are exact duplicates across all columns.
df = df.drop_duplicates()

In [8]:
# Row count after de-duplication.
len(df.index)

2126

In [9]:
# Count missing values per column. The printed label previously read
# "count of each value", which did not describe the numbers shown.
null_counts = df.isnull().sum()
print("count of null values in each column:\n", null_counts)

count of each value in each column:
 category    2
text        2
dtype: int64


In [10]:
# Discard every row that has a missing value in any column.
df = df.dropna()

In [11]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from wordcloud import WordCloud
from collections import Counter
import matplotlib.pyplot as plt

In [12]:
# Fetch the NLTK resources needed by word_tokenize and the stop-word filter.
nltk.download('punkt')
# Recent NLTK releases (3.9+) make word_tokenize load 'punkt_tab' instead of
# 'punkt'; without it a fresh install raises LookupError when tokenizing.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\B00346\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\B00346\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [None]:
# Category whose articles are used for the word cloud.
target_label = 'tech'
target_df = df.loc[df['category'] == target_label]

In [None]:
# Collect the lower-cased alphanumeric tokens of the selected category's
# articles, leaving out English stop words, then count them.
# The stop-word list is loaded once into a set: evaluating
# stopwords.words('english') in the comprehension's condition repeated the
# call, and a linear list lookup, for every single token.
english_stopwords = set(stopwords.words('english'))
target_words = [
    token.lower()
    for text in target_df['text']
    for token in word_tokenize(text)
    if token.isalnum() and token.lower() not in english_stopwords
]

target_word_counts = Counter(target_words)
print(target_word_counts)

In [None]:
# Build the word cloud from the word -> count mapping, using a WordCloud
# constructed with no arguments.
word_cloud = WordCloud().generate_from_frequencies(target_word_counts)

In [None]:
# Draw the word cloud as an image; the axes are hidden.
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis("off")
# Title typo fixed: "World cloud" -> "Word cloud".
plt.title(f"Word cloud for category: {target_label}")
plt.show()

In [None]:
import tiktoken

In [None]:
def cal_num_tokens_from_row(string: str, encoding_name: str) -> int:
    """Return how many tokens `string` encodes to.

    Despite its name, `encoding_name` is passed to
    `tiktoken.encoding_for_model`, so it is used as a model name
    (the notebook calls this with "gpt-3.5-turbo").
    """
    tokenizer = tiktoken.encoding_for_model(encoding_name)
    return len(tokenizer.encode(string))

def cal_num_tokens_from_df(df, encoding_name: str) -> int:
    """Return the total token count over the `text` column of `df`.

    Args:
        df: object whose `df['text']` yields the strings to tokenize.
        encoding_name: passed to `tiktoken.encoding_for_model`, so it is
            used as a model name (e.g. "gpt-3.5-turbo").

    Returns:
        Sum of the per-text token counts; 0 when there are no texts.
    """
    # The encoding is the same for every row, so look it up once instead of
    # once per text.
    encoding = tiktoken.encoding_for_model(encoding_name)
    return sum(len(encoding.encode(text)) for text in df['text'])


In [None]:
# Total token count of the current `df` for the gpt-3.5-turbo tokenizer.
total_tokens = cal_num_tokens_from_df(df, "gpt-3.5-turbo")
# Message typo fixed: "numbver" -> "number".
print(f"total number of tokens in the dataframe: {total_tokens}")

In [None]:
import csv
import os 
import random

In [None]:
# Create the output directory; exist_ok=True keeps a re-run from failing when
# the directory is already there.
os.makedirs('dataset', exist_ok=True)

In [None]:
# One {'text', 'label'} record per article: the text has surrounding
# whitespace stripped and the category becomes the label.
rows = [
    {'text': text.strip(), 'label': category}
    for text, category in zip(df['text'], df['category'])
]


In [None]:
# Seed the global RNG so the shuffle below gives the same order on every run.
random.seed(42)
# Shuffles `rows` in place.
random.shuffle(rows)

In [None]:
# The first `num_test` shuffled records form the test split; the remainder
# is the training split.
num_test = 500
splits = {
    'test': rows[:num_test],
    'train': rows[num_test:],
}

In [None]:
# Write each split to dataset/<split>.csv with a header row.
# No encoding is passed to open(), so the platform default applies;
# newline='' leaves line-ending handling to the csv module.
for split in ('train', 'test'):
    with open(f'dataset/{split}.csv', 'w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=['text', 'label'])
        writer.writeheader()
        writer.writerows(splits[split])


In [26]:
import json

# Read the test split back through a text-mode handle opened the same way the
# split files were written (default encoding, newline=''), so the file is
# decoded with the encoding it was written in.
# The previous encoding='unicode_escape' is not a file encoding: it decodes
# bytes as Latin-1 and treats every backslash in the article text as the start
# of an escape sequence, which corrupts or rejects such text.
# NOTE: this rebinds `df` from the full dataset to the test split.
with open("dataset/test.csv", newline='') as test_file:
    df = pd.read_csv(test_file)
final_df = df.head(500)
total_tokens = cal_num_tokens_from_df(final_df, "gpt-3.5-turbo")
# Message typo fixed: "numbver" -> "number".
print(f"total number of tokens in the dataframe: {total_tokens}")

total numbver of tokens in the dataframe: 73387


In [27]:
# Show column names, non-null counts and dtypes of the frame to be exported.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    150 non-null    object
 1   label   150 non-null    object
dtypes: object(2)
memory usage: 2.5+ KB


In [29]:
# Export every row of final_df as one JSON line holding a chat-style
# "messages" list: the system prompt, the article text as the user turn and
# the label as the assistant turn.
system = "You are a intelligent assistant designed to classify news articles into these categories: business, entertainment, politics, sport, tech"
with open("test.jsonl", "w") as outfile:
    for text, label in zip(final_df["text"], final_df["label"]):
        chat_record = {
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": text},
                {"role": "assistant", "content": label},
            ]
        }
        outfile.write(json.dumps(chat_record) + '\n')