# Data Labeling Classification

## Import Library

In [54]:
from tqdm import tqdm
import pandas as pd
import transformers
from transformers import pipeline
import matplotlib.pyplot as plt
import ast

## Data

### Load CSV

In [56]:
youtube_dataset = pd.read_csv("./data/youtube-comment-cleaned.csv")
youtube_dataset.head()

Unnamed: 0,comment,cleaned_comment,tokenized_comment,stopwords_comment,stemmed_comment,cleaned_stemmed
0,Tukang korup akhirnya kepancing wkwkw kasian k...,tukang korup akhirnya kepancing wkwkw kasian k...,"['tukang', 'korup', 'akhirnya', 'kepancing', '...","['tukang', 'korup', 'akhirnya', 'kepancing', '...","['tukang', 'korup', 'akhir', 'pancing', 'wkwkw...",tukang korup akhir pancing wkwkw kasi keluarga...
1,Bodohnya ini semua yg hadir tepuk tangan untuk...,bodohnya ini semua yg hadir tepuk tangan untuk...,"['bodohnya', 'ini', 'semua', 'yg', 'hadir', 't...","['bodohnya', 'semua', 'yg', 'hadir', 'tepuk', ...","['bodoh', 'semua', 'yg', 'hadir', 'tepuk', 'ta...",bodoh semua yg hadir tepuk tangan roki
2,Roki ini gaklama lagi\nJadi penhianat,roki ini gaklama lagi jadi penhianat,"['roki', 'ini', 'gaklama', 'lagi', 'jadi', 'pe...","['roki', 'gaklama', 'jadi', 'penhianat']","['roki', 'gaklama', 'jadi', 'penhianat']",roki gaklama jadi penhianat
3,Matador vs banteng dan bantengya silvester 😂,matador vs banteng dan bantengya silvester,"['matador', 'vs', 'banteng', 'dan', 'bantengya...","['matador', 'vs', 'banteng', 'bantengya', 'sil...","['matador', 'vs', 'banteng', 'bantengya', 'sil...",matador vs banteng bantengya silvester
4,Selverter itu jelas kurang wawasan dan ilmu di...,selverter itu jelas kurang wawasan dan ilmu di...,"['selverter', 'itu', 'jelas', 'kurang', 'wawas...","['selverter', 'jelas', 'kurang', 'wawasan', 'i...","['selverter', 'jelas', 'kurang', 'wawas', 'ilm...",selverter jelas kurang wawas ilmu bawah jauh r...


### Cleaning Confirmation

In [58]:
youtube_dataset = youtube_dataset.dropna()
youtube_dataset.isnull().sum()

comment              0
cleaned_comment      0
tokenized_comment    0
stopwords_comment    0
stemmed_comment      0
cleaned_stemmed      0
dtype: int64

## Classification Using Bigger Model

### Define Variable and Function

In [55]:
distilled_student_sentiment_classifier = pipeline(
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    return_all_scores=True,
    truncation=True
)

def sentiment_analysis(text):
    result = distilled_student_sentiment_classifier(text)
    highest_score_label = max(result[0], key=lambda x: x['score'])['label']
    return highest_score_label



### Apply Function

In [59]:
youtube_dataset["sentiment_prediction"] = [sentiment_analysis(comment) for comment in tqdm(youtube_dataset['cleaned_stemmed'])]

youtube_dataset.head()

100%|██████████| 20655/20655 [14:25<00:00, 23.87it/s]


Unnamed: 0,comment,cleaned_comment,tokenized_comment,stopwords_comment,stemmed_comment,cleaned_stemmed,sentiment_prediction
0,Tukang korup akhirnya kepancing wkwkw kasian k...,tukang korup akhirnya kepancing wkwkw kasian k...,"['tukang', 'korup', 'akhirnya', 'kepancing', '...","['tukang', 'korup', 'akhirnya', 'kepancing', '...","['tukang', 'korup', 'akhir', 'pancing', 'wkwkw...",tukang korup akhir pancing wkwkw kasi keluarga...,negative
1,Bodohnya ini semua yg hadir tepuk tangan untuk...,bodohnya ini semua yg hadir tepuk tangan untuk...,"['bodohnya', 'ini', 'semua', 'yg', 'hadir', 't...","['bodohnya', 'semua', 'yg', 'hadir', 'tepuk', ...","['bodoh', 'semua', 'yg', 'hadir', 'tepuk', 'ta...",bodoh semua yg hadir tepuk tangan roki,negative
2,Roki ini gaklama lagi\nJadi penhianat,roki ini gaklama lagi jadi penhianat,"['roki', 'ini', 'gaklama', 'lagi', 'jadi', 'pe...","['roki', 'gaklama', 'jadi', 'penhianat']","['roki', 'gaklama', 'jadi', 'penhianat']",roki gaklama jadi penhianat,negative
3,Matador vs banteng dan bantengya silvester 😂,matador vs banteng dan bantengya silvester,"['matador', 'vs', 'banteng', 'dan', 'bantengya...","['matador', 'vs', 'banteng', 'bantengya', 'sil...","['matador', 'vs', 'banteng', 'bantengya', 'sil...",matador vs banteng bantengya silvester,positive
4,Selverter itu jelas kurang wawasan dan ilmu di...,selverter itu jelas kurang wawasan dan ilmu di...,"['selverter', 'itu', 'jelas', 'kurang', 'wawas...","['selverter', 'jelas', 'kurang', 'wawasan', 'i...","['selverter', 'jelas', 'kurang', 'wawas', 'ilm...",selverter jelas kurang wawas ilmu bawah jauh r...,negative


## Save Classification CSV

In [62]:
youtube_dataset.to_csv("./data/youtube-comment-sentiment.csv", index=False)
youtube_dataset.head()

Unnamed: 0,comment,cleaned_comment,tokenized_comment,stopwords_comment,stemmed_comment,cleaned_stemmed,sentiment_prediction
0,Tukang korup akhirnya kepancing wkwkw kasian k...,tukang korup akhirnya kepancing wkwkw kasian k...,"['tukang', 'korup', 'akhirnya', 'kepancing', '...","['tukang', 'korup', 'akhirnya', 'kepancing', '...","['tukang', 'korup', 'akhir', 'pancing', 'wkwkw...",tukang korup akhir pancing wkwkw kasi keluarga...,negative
1,Bodohnya ini semua yg hadir tepuk tangan untuk...,bodohnya ini semua yg hadir tepuk tangan untuk...,"['bodohnya', 'ini', 'semua', 'yg', 'hadir', 't...","['bodohnya', 'semua', 'yg', 'hadir', 'tepuk', ...","['bodoh', 'semua', 'yg', 'hadir', 'tepuk', 'ta...",bodoh semua yg hadir tepuk tangan roki,negative
2,Roki ini gaklama lagi\nJadi penhianat,roki ini gaklama lagi jadi penhianat,"['roki', 'ini', 'gaklama', 'lagi', 'jadi', 'pe...","['roki', 'gaklama', 'jadi', 'penhianat']","['roki', 'gaklama', 'jadi', 'penhianat']",roki gaklama jadi penhianat,negative
3,Matador vs banteng dan bantengya silvester 😂,matador vs banteng dan bantengya silvester,"['matador', 'vs', 'banteng', 'dan', 'bantengya...","['matador', 'vs', 'banteng', 'bantengya', 'sil...","['matador', 'vs', 'banteng', 'bantengya', 'sil...",matador vs banteng bantengya silvester,positive
4,Selverter itu jelas kurang wawasan dan ilmu di...,selverter itu jelas kurang wawasan dan ilmu di...,"['selverter', 'itu', 'jelas', 'kurang', 'wawas...","['selverter', 'jelas', 'kurang', 'wawasan', 'i...","['selverter', 'jelas', 'kurang', 'wawas', 'ilm...",selverter jelas kurang wawas ilmu bawah jauh r...,negative


### Cleaning Data

In [63]:
youtube_dataset_clean = youtube_dataset[["cleaned_stemmed", "sentiment_prediction"]]
youtube_dataset_clean.head()

Unnamed: 0,cleaned_stemmed,sentiment_prediction
0,tukang korup akhir pancing wkwkw kasi keluarga...,negative
1,bodoh semua yg hadir tepuk tangan roki,negative
2,roki gaklama jadi penhianat,negative
3,matador vs banteng bantengya silvester,positive
4,selverter jelas kurang wawas ilmu bawah jauh r...,negative


In [64]:
youtube_dataset_clean.to_csv("./data/youtube-comment-sentiment-cleaned.csv", index=False)

### Balance Data

In [67]:
check_sentiment = youtube_dataset_clean["sentiment_prediction"].value_counts()
min_count = check_sentiment.min()
dataset_balanced = youtube_dataset_clean.groupby('sentiment_prediction').apply(lambda x: x.sample(min_count)).reset_index(drop=True)
print(dataset_balanced["sentiment_prediction"].value_counts())

sentiment_prediction
negative    444
neutral     444
positive    444
Name: count, dtype: int64


  dataset_balanced = youtube_dataset_clean.groupby('sentiment_prediction').apply(lambda x: x.sample(min_count)).reset_index(drop=True)


In [68]:
dataset_balanced.to_csv("./data/youtube-comment-sentiment-balanced.csv", index=False)