In [None]:
# Scrape YouTube: search for videos and collect their transcripts
# STEP 1: Install library yang dibutuhkan
!pip install google-api-python-client youtube-transcript-api pandas openai-whisper yt-dlp openpyxl --quiet

# STEP 2: Import library
import pandas as pd
import os
import time
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi
import whisper

# STEP 3: YouTube Data API key.
# Read the key from the YOUTUBE_API_KEY environment variable when it is set so
# the secret does not have to be stored in the notebook; the original
# placeholder is kept as the fallback value.
API_KEY = os.environ.get('YOUTUBE_API_KEY', 'insert your api key')

# STEP 4: Fungsi pencarian video YouTube
def search_youtube_videos(keyword, max_results=5):
    """Search YouTube for videos matching *keyword*.

    Pages through the YouTube Data API v3 ``search().list`` call (videos only,
    ``relevanceLanguage='id'``) until ``max_results`` videos were collected or
    the response carries no ``nextPageToken``.

    Returns a list of dicts with the keys ``video_id``, ``title``, ``link``,
    ``channel_name`` and ``publish_date``.
    """
    client = build('youtube', 'v3', developerKey=API_KEY)
    found = []
    page_token = None

    while len(found) < max_results:
        # Ask only for what is still missing; 50 is the per-request cap used here.
        remaining = max_results - len(found)
        response = client.search().list(
            q=keyword,
            part='snippet',
            type='video',
            maxResults=min(50, remaining),
            relevanceLanguage='id',
            videoCaption='any',
            pageToken=page_token
        ).execute()

        for entry in response['items']:
            # Skip results that do not carry a video id.
            if 'videoId' not in entry['id']:
                continue
            vid = entry['id']['videoId']
            info = entry['snippet']
            found.append({
                'video_id': vid,
                'title': info['title'],
                'link': f"https://www.youtube.com/watch?v={vid}",
                'channel_name': info['channelTitle'],
                'publish_date': info['publishedAt']
            })

        page_token = response.get('nextPageToken')
        if not page_token:
            break

    return found

# STEP 5: Fungsi ambil transkrip dari YouTube/Whisper
# Whisper models already loaded, keyed by model name, so each is loaded once.
_WHISPER_MODELS = {}


def _get_whisper_model(name="base"):
    """Return the Whisper model *name*, loading it only on first use."""
    if name not in _WHISPER_MODELS:
        _WHISPER_MODELS[name] = whisper.load_model(name)
    return _WHISPER_MODELS[name]


def _fetch_youtube_transcript(video_id, languages):
    """Return caption segments as a list of dicts that contain a 'text' key.

    Older youtube-transcript-api releases expose the static
    ``get_transcript``; newer ones expose an instance ``fetch`` whose result
    is converted back to the same list-of-dicts shape with ``to_raw_data``.
    """
    if hasattr(YouTubeTranscriptApi, 'get_transcript'):
        return YouTubeTranscriptApi.get_transcript(video_id, languages=languages)
    return YouTubeTranscriptApi().fetch(video_id, languages=languages).to_raw_data()


def get_video_transcript(video_id):
    """Return the transcript text for a YouTube video.

    First tries the Indonesian ('id') captions from YouTube. If that fails for
    any reason, downloads the audio with ``yt-dlp`` and transcribes it with the
    Whisper "base" model.

    Args:
        video_id: YouTube video id.

    Returns:
        The transcript as one string. When the Whisper fallback also fails the
        function returns a diagnostic string instead of raising:
        ``"Audio file not found for Whisper."`` or ``"Whisper failed: ..."``.
    """
    import subprocess

    try:
        segments = _fetch_youtube_transcript(video_id, ['id'])
        return " ".join([t['text'] for t in segments])
    except Exception as e:
        # Deliberately broad: any caption failure falls back to Whisper.
        print(f"Transkrip YouTube tidak tersedia: {e}")
        print("Menggunakan Whisper...")

    audio_file = f"{video_id}.mp3"
    try:
        # Argument list instead of a shell string: the video id is never
        # interpreted by a shell.
        subprocess.run(
            [
                'yt-dlp', '--quiet', '--extract-audio', '--audio-format', 'mp3',
                '-o', f"{video_id}.%(ext)s",
                f"https://www.youtube.com/watch?v={video_id}",
            ],
            check=False,
        )

        if not os.path.exists(audio_file):
            return "Audio file not found for Whisper."

        result = _get_whisper_model("base").transcribe(audio_file)
        return result['text']
    except Exception as whisper_error:
        return f"Whisper failed: {whisper_error}"
    finally:
        # Remove the downloaded audio even when transcription raised.
        if os.path.exists(audio_file):
            try:
                os.remove(audio_file)
            except OSError:
                pass  # best-effort cleanup; a leftover file must not hide the result

# STEP 6: Fungsi utama
def main(keyword):
    """Search YouTube for *keyword*, transcribe each hit and export to Excel.

    For every video returned by ``search_youtube_videos`` the transcript is
    fetched with ``get_video_transcript``. The rows are written to
    ``youtube_results.xlsx``; when running inside Google Colab the file is also
    offered as a browser download.

    Args:
        keyword: Search phrase passed to the YouTube search.

    Returns:
        A DataFrame with the columns ``No``, ``Link``, ``Title``, ``Text``,
        ``Channel`` and ``Upload Date``.
    """
    print(f"Mencari video dengan keyword: {keyword}")
    videos = search_youtube_videos(keyword)
    results = []

    for i, video in enumerate(videos, 1):
        print(f"\n[{i}] Judul: {video['title']}")
        text = get_video_transcript(video['video_id'])

        results.append({
            'No': i,
            'Link': video['link'],
            'Title': video['title'],
            'Text': text,
            'Channel': video['channel_name'],
            'Upload Date': video['publish_date']
        })

        # Delay between videos to avoid HTTP 429 (rate limit); there is
        # nothing left to throttle after the last one.
        if i < len(videos):
            time.sleep(5)

    df = pd.DataFrame(results)
    filename = 'youtube_results.xlsx'
    df.to_excel(filename, index=False)
    print(f"\n✅ Data tersimpan ke: {filename}")

    # The browser download only exists in Colab; elsewhere the saved file and
    # the returned DataFrame are still usable.
    try:
        from google.colab import files
    except ImportError:
        print("Bukan di Google Colab: unduhan otomatis dilewati.")
    else:
        files.download(filename)

    return df

# STEP 7: Run the search for the chosen keyword.
keyword = "ikn ibu kota nusantara"
df = main(keyword)
# Last expression of the cell: the notebook displays the first rows of the result.
df.head()


In [None]:
# Scrape Google News results through SerpApi
import urllib.parse
import requests
import pandas as pd
from google.colab import files

# 1. Keywords to search for.
keywords = ['ikn']

for string in keywords:
    # 2. Result accumulators, reset for every keyword.
    title = []
    snippet = []
    url = []
    source = []
    date = []

    # 3. Result offsets to request: two pages of up to 100 results each.
    for i in [0, 100]:
        # 4. Search parameters.
        params = {
            'q': string,
            'tbm': "nws",
            'location': "Indonesia",
            'api_key': "insert your api key",
            "start": i,
            'num': "100",
            'lr': "lang_id",  # 5. Restrict results to Indonesian.
            # 6. Custom date range 1 Jan 2023 - 29 Aug 2024. Both bounds are
            # written month/day/year; the previous "29/08/2024" was day-first
            # and therefore not a valid date in that format.
            'tbs': "cdr:1,cd_min:1/1/2023,cd_max:8/29/2024"
        }

        # 7-8. Let requests encode the query string; the timeout keeps the
        # cell from hanging forever on a stalled connection.
        r = requests.get('https://serpapi.com/search.json', params=params, timeout=60)
        if not r.ok:
            # Report the failure instead of silently exporting an empty sheet.
            print(f"Permintaan gagal (HTTP {r.status_code}) untuk keyword '{string}', start={i}: {r.text[:200]}")
            continue
        data = r.json()
        if 'error' in data:
            print(f"SerpApi error untuk keyword '{string}', start={i}: {data['error']}")

        # 9-11. Collect the news results, if any; missing fields become ''.
        for item in data.get('news_results', []):
            url.append(item.get('link', ''))
            title.append(item.get('title', ''))
            snippet.append(item.get('snippet', ''))
            source.append(item.get('source', ''))
            date.append(item.get('date', ''))

    # 12. Build the DataFrame; 'no' is a 1-based running number.
    d = {
        'no': range(1, len(url) + 1),
        'url': url,
        'title': title,
        'content': snippet,
        'source': source,
        'date': date,
    }
    df = pd.DataFrame(d)

    # 13. Output file name for this keyword.
    filename = 'Keyword_' + string + '_2024.xlsx'

    # 14. Save without the default index column.
    df.to_excel(filename, index=False)

    # 15. Offer the Excel file as a download.
    files.download(filename)


In [None]:
# Text preprocessing: case folding, filtering, tokenizing, stopword removal, stemming
# Install library terlebih dahulu
!pip install openpyxl
!pip install Sastrawi
!pip install swifter

# Import library yang diperlukan
import pandas as pd
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter
from google.colab import files

# Upload the Excel file and read the first uploaded file.
uploaded = files.upload()
if not uploaded:
    # Without this guard an empty upload surfaces as a bare StopIteration.
    raise ValueError("Tidak ada file yang diunggah.")
df = pd.read_excel(next(iter(uploaded.keys())))

# Require a text column. 'Text' is what the YouTube cell writes, 'content' is
# what the Google News cell writes, and lowercase 'text' is accepted by the
# column selection further down in this cell.
if not any(col in df.columns for col in ('Text', 'text', 'content')):
    raise ValueError("Kolom 'text' atau 'content' tidak ditemukan pada file.")


# Fungsi Preprocessing
def case_folding(text):
    """Lower-case *text*; anything that is not a ``str`` becomes ``""``."""
    if not isinstance(text, str):
        return ""
    return text.lower()

def filtering(text):
    """Remove every character that is not an ASCII letter or whitespace.

    Non-string input yields ``""``.
    """
    if not isinstance(text, str):
        return ""
    non_letters = re.compile(r'[^a-zA-Z\s]')
    return non_letters.sub('', text)

def tokenize(text):
    """Split *text* on runs of whitespace using plain ``str.split``."""
    tokens = text.split()
    return tokens

# Sastrawi stopword remover, created on first use and then reused.
_STOPWORD_REMOVER = None


def remove_stopwords(tokens):
    """Remove stopwords from *tokens* with Sastrawi's stopword remover.

    The tokens are joined into one sentence, passed through the remover and
    split again.

    Args:
        tokens: List of word strings.

    Returns:
        List of the remaining words.
    """
    global _STOPWORD_REMOVER
    if _STOPWORD_REMOVER is None:
        # Build the remover once instead of once per row.
        _STOPWORD_REMOVER = StopWordRemoverFactory().create_stop_word_remover()
    filtered_sentence = _STOPWORD_REMOVER.remove(' '.join(tokens))
    return filtered_sentence.split()

# Sastrawi stemmer, created on first use and then reused.
_STEMMER = None


def stemming(tokens):
    """Stem every word in *tokens* with the Sastrawi stemmer.

    Args:
        tokens: List of word strings.

    Returns:
        List with the stemmed form of each word, in the same order.
    """
    global _STEMMER
    if _STEMMER is None:
        # Build the stemmer once instead of once per row.
        _STEMMER = StemmerFactory().create_stemmer()
    return [_STEMMER.stem(word) for word in tokens]

def preprocess_text(text):
    """Run the whole cleaning pipeline on one text and keep every stage.

    Stages: case folding, filtering, tokenizing, stopword removal, stemming,
    then joining the stems back into one string.

    Returns a dict with the keys ``Original Text``, ``Case Folding``,
    ``Filtering``, ``Token``, ``Stopword``, ``Stemming`` and ``Cleaned Text``.
    If any stage raises, the error is printed and all stages except
    ``Original Text`` are returned empty.
    """
    # Result used when the pipeline fails part-way through.
    fallback = {
        'Original Text': text,
        'Case Folding': '',
        'Filtering': '',
        'Token': [],
        'Stopword': [],
        'Stemming': [],
        'Cleaned Text': ''
    }

    try:
        lowered = case_folding(text)
        letters_only = filtering(lowered)
        words = tokenize(letters_only)
        content_words = remove_stopwords(words)
        stems = stemming(content_words)
        joined = ' '.join(stems)
    except Exception as e:
        print(f"Error preprocessing text: {e}")
        return fallback

    return {
        'Original Text': text,
        'Case Folding': lowered,
        'Filtering': letters_only,
        'Token': words,
        'Stopword': content_words,
        'Stemming': stems,
        'Cleaned Text': joined
    }

# Pick the text column. 'text' and 'content' keep their original priority;
# 'Text' (the column name written by the YouTube cell) is accepted as well so
# such a file no longer fails with a KeyError on 'content'.
text_column = next((col for col in ('text', 'content', 'Text') if col in df.columns), None)
if text_column is None:
    raise ValueError("Kolom 'text' atau 'content' tidak ditemukan pada file.")

# Apply the preprocessing. Missing cells are turned into '' first; a plain
# astype(str) would turn them into the literal word "nan".
preprocessed_df = df[text_column].fillna('').astype(str).swifter.apply(preprocess_text).apply(pd.Series)

# Save the result to Excel, named after the uploaded file, and download it.
filename = 'hasil_preprocessed_' + list(uploaded.keys())[0]
preprocessed_df.to_excel(filename, index=False)
files.download(filename)

# Show the first 5 rows.
preprocessed_df.head()


In [None]:
#TF-IDF

# Includes visualisations (word cloud and bar chart)
import pandas as pd
import math
from collections import defaultdict
from google.colab import files
import io
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns

# Upload the Excel file.
uploaded = files.upload()
if not uploaded:
    # Otherwise `df` below would be undefined or left over from another cell.
    raise ValueError("Tidak ada file yang diunggah.")

# Read the data from sheet 'Sheet1'. If several files are uploaded, each one
# overwrites `df`, so only the last file is used.
for filename in uploaded.keys():
    df = pd.read_excel(io.BytesIO(uploaded[filename]), sheet_name='Sheet1')

# The stemmed text is expected in the 'Cleaned Text' column. Empty cells are
# read back from Excel as NaN; replace them with '' so they do not end up in
# the vocabulary as the word "nan".
documents = df['Cleaned Text'].fillna('').astype(str).tolist()

# Menghitung Term Frequency (TF) untuk setiap dokumen
def compute_tf(text):
    """Return the term frequency of every word in *text*.

    The text is split on whitespace; each word's count is divided by the total
    number of words. The result is a ``defaultdict(int)``, so unknown words
    read as 0. An empty text yields an empty mapping.
    """
    tokens = text.split()
    n_tokens = len(tokens)

    counts = defaultdict(int)
    for token in tokens:
        counts[token] += 1

    # Normalise the raw counts by the document length.
    return defaultdict(int, {token: count / n_tokens for token, count in counts.items()})

# Menghitung Document Frequency (DF) untuk semua dokumen
def compute_df(docs):
    """Return, for every word, the number of documents in *docs* containing it.

    Each document is split on whitespace and every distinct word is counted
    once per document. The result is a ``defaultdict(int)``.
    """
    doc_counts = defaultdict(int)
    for document in docs:
        # dict.fromkeys de-duplicates the words of this document.
        for term in dict.fromkeys(document.split()):
            doc_counts[term] += 1
    return doc_counts

# Menghitung Inverse Document Frequency (IDF) untuk semua dokumen menggunakan logaritma basis 10
def compute_idf(total_docs, df_dict):
    """Return the inverse document frequency ``log10(total_docs / df)`` per word.

    *df_dict* maps each word to its document frequency.
    """
    return {
        term: math.log10(total_docs / doc_count)
        for term, doc_count in df_dict.items()
    }

# Menghitung TF-IDF untuk setiap dokumen
def compute_tfidf(tf_dict, idf_dict):
    """Return TF * IDF for every word of one document.

    Words missing from *idf_dict* are treated as having an IDF of 0.
    """
    return {
        term: weight * idf_dict.get(term, 0)
        for term, weight in tf_dict.items()
    }

# Corpus-wide statistics: number of documents, DF and IDF per word.
total_docs = len(documents)
df_dict = compute_df(documents)
idf_dict = compute_idf(total_docs, df_dict)

# One output row per (document, word) pair, plus a running total per word.
tfidf_results = []
word_freq = defaultdict(int)  # sum of the per-document TF values of each word

for i, doc in enumerate(documents, start=1):
    tf_dict = compute_tf(doc)
    tfidf_dict = compute_tfidf(tf_dict, idf_dict)
    for word, tf_value in tf_dict.items():
        word_freq[word] += tf_value
        tfidf_results.append({
            'no': i,
            'kata': word,
            'TF': tf_value,
            'DF': df_dict[word],
            'IDF': idf_dict[word],
            'TF-IDF': tfidf_dict[word]
        })

# Turn the rows into a DataFrame.
tfidf_df = pd.DataFrame(tfidf_results)

# Visualisation 1: word cloud weighted by the accumulated TF of each word.
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate_from_frequencies(word_freq)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title("WordCloud dari Kata-Kata yang Paling Sering Muncul")
plt.axis('off')
plt.show()

# Visualisation 2: bar plot of the rows with the highest TF-IDF values.
top_n = 20  # number of rows to show
ranked_tfidf_df = tfidf_df.sort_values(by="TF-IDF", ascending=False)
top_tfidf_df = ranked_tfidf_df.head(top_n)

plt.figure(figsize=(10, 6))
sns.barplot(x="TF-IDF", y="kata", data=top_tfidf_df)
plt.title(f"Top {top_n} Kata dengan Nilai TF-IDF Tertinggi")
plt.xlabel("Nilai TF-IDF")
plt.ylabel("Kata")
plt.show()

# Save the full table to a new Excel file and offer it as a download.
output_file_path = 'ok fix hasil new tfidf_output.xlsx'
tfidf_df.to_excel(output_file_path, index=False)
files.download(output_file_path)

print("TF-IDF berhasil dihitung, disimpan di", output_file_path, "dan divisualisasikan.")


In [None]:
# Sentiment labelling with a word lexicon


import pandas as pd
from google.colab import files
import io
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Step 1: upload the Excel file with the sentiment word list and read the
# first uploaded file into a DataFrame.
uploaded_sentiment = files.upload()
file_name_sentiment = list(uploaded_sentiment)[0]
sentiment_bytes = uploaded_sentiment[file_name_sentiment]
sentiment_df = pd.read_excel(io.BytesIO(sentiment_bytes))

# Preview the lexicon to confirm it was read correctly.
print("Daftar Kata Sentimen:")
print(sentiment_df.head())

# Step 2: upload the file with the texts to label; only the 'Cleaned Text'
# column is loaded.
uploaded_text = files.upload()
file_name_text = list(uploaded_text)[0]
text_bytes = uploaded_text[file_name_text]
df_clean = pd.read_excel(io.BytesIO(text_bytes), usecols=['Cleaned Text'])

# Preview the texts to confirm the file was read correctly.
print("\nTeks yang Akan Dilabeli Sentimen:")
print(df_clean.head())

# Replace missing values with empty strings.
df_clean = df_clean.fillna('')

# Langkah 3: Definisi Fungsi untuk Menghitung Kata Sentimen dan Melabeli Dokumen Berdasarkan Polaritas dan Hate Speech
def calculate_polarity_and_hate_speech(text_df, sentiment_df):
    """Score every text against a sentiment lexicon.

    Each text in ``text_df['Cleaned Text']`` is lower-cased and split on
    whitespace. Every word found in the lexicon adds +1 (``'positive'``) or -1
    (``'negative'``) to the polarity, or increments the hate-speech counter
    (``'hate_speech'``). Non-string texts are treated as empty.

    Args:
        text_df: DataFrame with a ``'Cleaned Text'`` column. It is modified in
            place: the columns ``polarity``, ``hate_speech_counts``,
            ``positive_words``, ``negative_words`` and ``hate_speech_words``
            are added.
        sentiment_df: DataFrame with the columns ``'word'`` and ``'sentiment'``.

    Returns:
        The same ``text_df`` object with the added columns.
    """
    # Build the lookup once instead of filtering the DataFrame for every word.
    # setdefault keeps the first row of a duplicated word, which is what
    # taking `.values[0]` of the filtered rows did.
    lexicon = {}
    for word, sentiment in zip(sentiment_df['word'], sentiment_df['sentiment']):
        lexicon.setdefault(word, sentiment)

    polarities = []
    hate_speech_counts = []
    positive_words = []
    negative_words = []
    hate_speech_words = []

    for text in text_df['Cleaned Text']:
        tokens = text.lower().split() if isinstance(text, str) else []

        pos_words = []
        neg_words = []
        hate_words = []
        for word in tokens:
            sentiment = lexicon.get(word)
            if sentiment == 'positive':
                pos_words.append(word)
            elif sentiment == 'negative':
                neg_words.append(word)
            elif sentiment == 'hate_speech':
                hate_words.append(word)

        # Polarity is (#positive hits) - (#negative hits).
        polarities.append(len(pos_words) - len(neg_words))
        hate_speech_counts.append(len(hate_words))
        positive_words.append(" ".join(pos_words))
        negative_words.append(" ".join(neg_words))
        hate_speech_words.append(" ".join(hate_words))

    text_df['polarity'] = polarities
    text_df['hate_speech_counts'] = hate_speech_counts
    text_df['positive_words'] = positive_words
    text_df['negative_words'] = negative_words
    text_df['hate_speech_words'] = hate_speech_words
    return text_df

def _sentiment_label(polarity, hate_count):
    """Map one row's scores to a label; any hate-speech hit takes precedence."""
    if hate_count > 0:
        return 'hate_speech'
    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    return 'neutral'


# Step 4: compute polarity and hate-speech counts for every text.
df_label = calculate_polarity_and_hate_speech(df_clean, sentiment_df)

# Step 5: assign the label column in one go. Building the whole column avoids
# per-row .loc writes (slow, and dependent on unique index labels) and creates
# the column even when there are no rows.
df_label['label'] = [
    _sentiment_label(polarity, hate_count)
    for polarity, hate_count in zip(df_label['polarity'], df_label['hate_speech_counts'])
]

# Number of matched words per category (the matches are stored space-joined).
df_label['positive_word_count'] = df_label['positive_words'].apply(lambda x: len(x.split()))
df_label['negative_word_count'] = df_label['negative_words'].apply(lambda x: len(x.split()))
df_label['hate_speech_word_count'] = df_label['hate_speech_words'].apply(lambda x: len(x.split()))

# Show the labelling result.
print("\nHasil Pelabelan Sentimen:")
print(df_label.head())

# Step 5.1: count how many rows received each label.
label_counts = df_label['label'].value_counts()
print("\nTotal Jumlah Masing-masing Label:")
print(label_counts)

# Step 6: ask the user for the output file name.
output_file_name = input("Masukkan nama file untuk menyimpan hasil (contoh: 'Pelabelan_Hasil.xlsx'): ")

# Drop surrounding whitespace and quotes (the prompt's example is quoted), and
# fall back to the example name when nothing was typed, so the file is never
# saved as just ".xlsx".
output_file_name = output_file_name.strip().strip("'\"").strip()
if not output_file_name:
    output_file_name = 'Pelabelan_Hasil.xlsx'

# Append the .xlsx extension when it is missing (checked case-insensitively).
if not output_file_name.lower().endswith('.xlsx'):
    output_file_name += '.xlsx'

df_label.to_excel(output_file_name, index=False)

# Offer the saved Excel file as a download.
files.download(output_file_name)

# Step 7: pie chart of the label distribution, using the counts computed in
# step 5.1.
pie_colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99']

plt.figure(figsize=(10, 6))
plt.pie(
    label_counts,
    labels=label_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=pie_colors,
)
plt.title('Distribusi Kelas Sentimen')
plt.axis('equal')  # keep the pie circular
plt.show()

# Step 8: one word cloud per sentiment class.
def _show_wordcloud(words, title):
    """Plot a word cloud for the space-joined *words* under *title*.

    Returns the WordCloud, or None when *words* contains no word at all:
    WordCloud.generate raises on empty text, which would abort the remaining
    plots whenever a class has no matched words.
    """
    if not words.strip():
        print(f"Tidak ada kata untuk '{title}'; WordCloud dilewati.")
        return None

    cloud = WordCloud(width=800, height=400, background_color='white').generate(words)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()
    return cloud


# Word cloud for positive words.
positive_words = ' '.join(df_label['positive_words'].tolist())
wordcloud_pos = _show_wordcloud(positive_words, 'WordCloud - Positive Words')

# Word cloud for negative words.
negative_words = ' '.join(df_label['negative_words'].tolist())
wordcloud_neg = _show_wordcloud(negative_words, 'WordCloud - Negative Words')

# Word cloud for hate-speech words.
hate_speech_words = ' '.join(df_label['hate_speech_words'].tolist())
wordcloud_hate = _show_wordcloud(hate_speech_words, 'WordCloud - Hate Speech Words')
