In [63]:
pip install textblob

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [64]:
import pandas as pd
import numpy as np

import os

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary

from tqdm import tqdm 

import re
import string

from textblob import TextBlob

In [6]:
# Specify the path to the directory containing your CSV files
folder_path = 'dataset/'

# Initialize an empty list to store DataFrames
dfs = []

# Iterate through each file in the specified directory
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        # Construct the full path to the CSV file
        file_path = os.path.join(folder_path, filename)
        
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        
        # Append the DataFrame to the list
        dfs.append(df)

# Concatenate all DataFrames in the list into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)


In [14]:
combined_df

Unnamed: 0,publishedAt,authorDisplayName,textDisplay,likeCount
0,2023-12-07T13:51:52Z,@idaesti1833,Semoga terwujud Aamiim presiden,0.0
1,2023-12-07T10:50:47Z,@smithwerben1058,MANTAPPPP,0.0
2,2023-12-07T05:37:35Z,@MahendraDatta-fi1vz,"Anies cocok jadi Presiden 8<br>Ganteng, amana...",0.0
3,2023-12-07T03:35:32Z,@CaturWidodo-lh2je,jujur demi Allah biarkan kalian mengoceh yang ...,0.0
4,2023-12-06T23:18:54Z,@asepsodikin6801,Sy dukung prabowa krn wakilnya bkan saatnya sk...,0.0
...,...,...,...,...
135179,2023-10-25T11:01:26Z,Gus Wedi,"Woo...woo , ditipu berkali kali kok masih perc...",1.0
135180,2023-10-25T10:56:53Z,Singa Anom,"Hoax lagi.....................<a href=""about:i...",0.0
135181,2023-10-25T10:56:12Z,Rakyat Jelata,"Nggak salah kok pak, biar sama-sama nyungsep n...",3.0
135182,2023-10-25T11:29:48Z,Anak Terpelajar,Amin,0.0


In [15]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135184 entries, 0 to 135183
Data columns (total 4 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   publishedAt        135184 non-null  object 
 1   authorDisplayName  135113 non-null  object 
 2   textDisplay        135083 non-null  object 
 3   likeCount          135052 non-null  float64
dtypes: float64(1), object(3)
memory usage: 4.1+ MB


In [17]:
combined_df.head()

Unnamed: 0,publishedAt,authorDisplayName,textDisplay,likeCount
0,2023-12-07T13:51:52Z,@idaesti1833,Semoga terwujud Aamiim presiden,0.0
1,2023-12-07T10:50:47Z,@smithwerben1058,MANTAPPPP,0.0
2,2023-12-07T05:37:35Z,@MahendraDatta-fi1vz,"Anies cocok jadi Presiden 8<br>Ganteng, amana...",0.0
3,2023-12-07T03:35:32Z,@CaturWidodo-lh2je,jujur demi Allah biarkan kalian mengoceh yang ...,0.0
4,2023-12-06T23:18:54Z,@asepsodikin6801,Sy dukung prabowa krn wakilnya bkan saatnya sk...,0.0


In [35]:
df = combined_df.drop(columns=['authorDisplayName','likeCount'])

In [36]:
df

Unnamed: 0,publishedAt,textDisplay
0,2023-12-07T13:51:52Z,Semoga terwujud Aamiim presiden
1,2023-12-07T10:50:47Z,MANTAPPPP
2,2023-12-07T05:37:35Z,"Anies cocok jadi Presiden 8<br>Ganteng, amana..."
3,2023-12-07T03:35:32Z,jujur demi Allah biarkan kalian mengoceh yang ...
4,2023-12-06T23:18:54Z,Sy dukung prabowa krn wakilnya bkan saatnya sk...
...,...,...
135179,2023-10-25T11:01:26Z,"Woo...woo , ditipu berkali kali kok masih perc..."
135180,2023-10-25T10:56:53Z,"Hoax lagi.....................<a href=""about:i..."
135181,2023-10-25T10:56:12Z,"Nggak salah kok pak, biar sama-sama nyungsep n..."
135182,2023-10-25T11:29:48Z,Amin


In [37]:
df.isnull().sum()

publishedAt      0
textDisplay    101
dtype: int64

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135184 entries, 0 to 135183
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   publishedAt  135184 non-null  object
 1   textDisplay  135083 non-null  object
dtypes: object(2)
memory usage: 2.1+ MB


In [42]:
df_cleaned = df.dropna()

# Tampilkan informasi tentang DataFrame yang sudah dibersihkan
print("Info for Cleaned DataFrame:")
df_cleaned.isnull().sum()

Info for Cleaned DataFrame:


publishedAt    0
textDisplay    0
dtype: int64

## Tahapan 2 Prepocessing
1. stopword(Menghapus tanda @,http link,hastag,tanda baca petik, tanda seru, tanda tanya, spasi, nomor, emoji)
2. Lower Case
3. Tokenisasi

In [60]:
clean_texts = []


stop_factory = StopWordRemoverFactory().get_stop_words() #load defaul stopword

for text in tqdm(df_cleaned['textDisplay']):
    text = re.sub("@[A-Za-z0-9]+","",text)
    text = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", text) #Remove http links
    text = text.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text
    text = re.sub(r"\d+", "", text) # Remove number
    text = text.replace('"','') #remove quotation mark
    text = text.lower() #Lower Case
    text = text.strip() # Remove Whitespace
    text = text.translate(str.maketrans("","",string.punctuation)) #Remove Punctuation
    
    # Menghapus Stopword
    dictionary = ArrayDictionary(stop_factory)
    swr = StopWordRemover(dictionary)
    text = swr.remove(text)
    clean_texts.append(text)

100%|███████████████████████████████████████████████████████████████████████| 135083/135083 [00:08<00:00, 16434.80it/s]


In [61]:
df_cleaned['clear'] = clean_texts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['clear'] = clean_texts


Klasifikasi sentimen ada 2 bagian, yaitu polarity dan subjectivity. Dengan function print, kita dapat melihat kedua hasil tersebut. Polarity digunakan untuk melihat seberapa positif atau negatif sebuah teks, dan subjectivity untuk melihat value dari tweet itu adalah opini atau faktual. Semakin tinggi subjectivity berarti tweet tersebut dapat dikatakan sebagai opini, sedangkan semakin tinggi polarity, itu menandakan feel/emotion yang semakin positif juga dari sebuah tweet.

In [67]:
clean_texts.info()


AttributeError: 'list' object has no attribute 'info'