# Data Labeling Classification

## Import Library

In [5]:
from tqdm import tqdm
import pandas as pd
import transformers
from transformers import pipeline
import matplotlib.pyplot as plt
import ast

## Data

### Load CSV

In [6]:
youtube_dataset = pd.read_csv("../data/youtube-comment-cleaned.csv")
youtube_dataset.head()

Unnamed: 0,comment,cleaned_comment,tokenized_comment,stopwords_comment,stemmed_comment,cleaned_stemmed
0,Tukang korup akhirnya kepancing wkwkw kasian k...,tukang korup akhirnya kepancing wkwkw kasian k...,"['tukang', 'korup', 'akhirnya', 'kepancing', '...","['tukang', 'korup', 'kepancing', 'kasian', 'ke...","['tukang', 'korup', 'pancing', 'kasi', 'keluar...",tukang korup pancing kasi keluarga anak
1,Bodohnya ini semua yg hadir tepuk tangan untuk...,bodohnya ini semua yang hadir tepuk tangan unt...,"['bodohnya', 'ini', 'semua', 'yang', 'hadir', ...","['bodohnya', 'hadir', 'tepuk', 'tangan', 'rocky']","['bodoh', 'hadir', 'tepuk', 'tangan', 'rocky']",bodoh hadir tepuk tangan rocky
2,Roki ini gaklama lagi\nJadi penhianat,rocky ini tidak lama lagi jadi penhianat,"['rocky', 'ini', 'tidak', 'lama', 'lagi', 'jad...","['rocky', 'penhianat']","['rocky', 'penhianat']",rocky penhianat
3,Matador vs banteng dan bantengya silvester 😂,matador versus banteng dan bantengya silfester,"['matador', 'versus', 'banteng', 'dan', 'bante...","['matador', 'versus', 'banteng', 'bantengya', ...","['matador', 'versus', 'banteng', 'bantengya', ...",matador versus banteng bantengya silfester
4,Selverter itu jelas kurang wawasan dan ilmu di...,selverter itu jelas kurang wawasan dan ilmu di...,"['selverter', 'itu', 'jelas', 'kurang', 'wawas...","['selverter', 'wawasan', 'ilmu', 'rokcy']","['selverter', 'wawas', 'ilmu', 'rokcy']",selverter wawas ilmu rokcy


### Cleaning Confirmation

In [7]:
youtube_dataset = youtube_dataset.dropna()
youtube_dataset.isnull().sum()

comment              0
cleaned_comment      0
tokenized_comment    0
stopwords_comment    0
stemmed_comment      0
cleaned_stemmed      0
dtype: int64

## Classification Using Bigger Model

### Define Variable and Function

In [8]:
distilled_student_sentiment_classifier = pipeline(
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    return_all_scores=True,
    truncation=True
)

def sentiment_analysis(text):
    result = distilled_student_sentiment_classifier(text)
    highest_score_label = max(result[0], key=lambda x: x['score'])['label']
    return highest_score_label



### Apply Function

In [9]:
youtube_dataset["sentiment_prediction"] = [sentiment_analysis(comment) for comment in tqdm(youtube_dataset['cleaned_stemmed'])]

youtube_dataset.head()

100%|██████████| 20403/20403 [07:16<00:00, 46.72it/s]


Unnamed: 0,comment,cleaned_comment,tokenized_comment,stopwords_comment,stemmed_comment,cleaned_stemmed,sentiment_prediction
0,Tukang korup akhirnya kepancing wkwkw kasian k...,tukang korup akhirnya kepancing wkwkw kasian k...,"['tukang', 'korup', 'akhirnya', 'kepancing', '...","['tukang', 'korup', 'kepancing', 'kasian', 'ke...","['tukang', 'korup', 'pancing', 'kasi', 'keluar...",tukang korup pancing kasi keluarga anak,negative
1,Bodohnya ini semua yg hadir tepuk tangan untuk...,bodohnya ini semua yang hadir tepuk tangan unt...,"['bodohnya', 'ini', 'semua', 'yang', 'hadir', ...","['bodohnya', 'hadir', 'tepuk', 'tangan', 'rocky']","['bodoh', 'hadir', 'tepuk', 'tangan', 'rocky']",bodoh hadir tepuk tangan rocky,negative
2,Roki ini gaklama lagi\nJadi penhianat,rocky ini tidak lama lagi jadi penhianat,"['rocky', 'ini', 'tidak', 'lama', 'lagi', 'jad...","['rocky', 'penhianat']","['rocky', 'penhianat']",rocky penhianat,positive
3,Matador vs banteng dan bantengya silvester 😂,matador versus banteng dan bantengya silfester,"['matador', 'versus', 'banteng', 'dan', 'bante...","['matador', 'versus', 'banteng', 'bantengya', ...","['matador', 'versus', 'banteng', 'bantengya', ...",matador versus banteng bantengya silfester,negative
4,Selverter itu jelas kurang wawasan dan ilmu di...,selverter itu jelas kurang wawasan dan ilmu di...,"['selverter', 'itu', 'jelas', 'kurang', 'wawas...","['selverter', 'wawasan', 'ilmu', 'rokcy']","['selverter', 'wawas', 'ilmu', 'rokcy']",selverter wawas ilmu rokcy,negative


## Save Classification CSV

In [10]:
youtube_dataset.to_csv("../data/youtube-comment-cleaned-sentiment.csv", index=False)
youtube_dataset.head()

Unnamed: 0,comment,cleaned_comment,tokenized_comment,stopwords_comment,stemmed_comment,cleaned_stemmed,sentiment_prediction
0,Tukang korup akhirnya kepancing wkwkw kasian k...,tukang korup akhirnya kepancing wkwkw kasian k...,"['tukang', 'korup', 'akhirnya', 'kepancing', '...","['tukang', 'korup', 'kepancing', 'kasian', 'ke...","['tukang', 'korup', 'pancing', 'kasi', 'keluar...",tukang korup pancing kasi keluarga anak,negative
1,Bodohnya ini semua yg hadir tepuk tangan untuk...,bodohnya ini semua yang hadir tepuk tangan unt...,"['bodohnya', 'ini', 'semua', 'yang', 'hadir', ...","['bodohnya', 'hadir', 'tepuk', 'tangan', 'rocky']","['bodoh', 'hadir', 'tepuk', 'tangan', 'rocky']",bodoh hadir tepuk tangan rocky,negative
2,Roki ini gaklama lagi\nJadi penhianat,rocky ini tidak lama lagi jadi penhianat,"['rocky', 'ini', 'tidak', 'lama', 'lagi', 'jad...","['rocky', 'penhianat']","['rocky', 'penhianat']",rocky penhianat,positive
3,Matador vs banteng dan bantengya silvester 😂,matador versus banteng dan bantengya silfester,"['matador', 'versus', 'banteng', 'dan', 'bante...","['matador', 'versus', 'banteng', 'bantengya', ...","['matador', 'versus', 'banteng', 'bantengya', ...",matador versus banteng bantengya silfester,negative
4,Selverter itu jelas kurang wawasan dan ilmu di...,selverter itu jelas kurang wawasan dan ilmu di...,"['selverter', 'itu', 'jelas', 'kurang', 'wawas...","['selverter', 'wawasan', 'ilmu', 'rokcy']","['selverter', 'wawas', 'ilmu', 'rokcy']",selverter wawas ilmu rokcy,negative


### Balance Sentiment Data

In [11]:
check_sentiment = youtube_dataset["sentiment_prediction"].value_counts()
min_count = check_sentiment.min()
dataset_balanced = youtube_dataset.groupby('sentiment_prediction').apply(lambda x: x.sample(min_count)).reset_index(drop=True)
print(dataset_balanced["sentiment_prediction"].value_counts())

sentiment_prediction
negative    57
neutral     57
positive    57
Name: count, dtype: int64


  dataset_balanced = youtube_dataset.groupby('sentiment_prediction').apply(lambda x: x.sample(min_count)).reset_index(drop=True)


In [12]:
dataset_balanced.to_csv("../data/youtube-comment-cleaned-sentiment-balanced.csv", index=False)

### Reformating Column

In [13]:
youtube_dataset_clean = youtube_dataset[["cleaned_stemmed", "sentiment_prediction"]]
youtube_dataset_clean.head()

Unnamed: 0,cleaned_stemmed,sentiment_prediction
0,tukang korup pancing kasi keluarga anak,negative
1,bodoh hadir tepuk tangan rocky,negative
2,rocky penhianat,positive
3,matador versus banteng bantengya silfester,negative
4,selverter wawas ilmu rokcy,negative


In [14]:
youtube_dataset_clean.to_csv("../data/youtube-comment-cleaned-sentiment-reformat.csv", index=False)

### Balance Data After Reformat

In [15]:
check_sentiment = youtube_dataset_clean["sentiment_prediction"].value_counts()
min_count = check_sentiment.min()
dataset_balanced = youtube_dataset_clean.groupby('sentiment_prediction').apply(lambda x: x.sample(min_count)).reset_index(drop=True)
print(dataset_balanced["sentiment_prediction"].value_counts())

sentiment_prediction
negative    57
neutral     57
positive    57
Name: count, dtype: int64


  dataset_balanced = youtube_dataset_clean.groupby('sentiment_prediction').apply(lambda x: x.sample(min_count)).reset_index(drop=True)


In [16]:
dataset_balanced.to_csv("../data/youtube-comment-cleaned-sentiment-reformat-balanced.csv", index=False)