# Deteksi Berita Hoax

### Import Library yang dibutuhkan

In [None]:
#Import module yang diperlukan
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import numpy as np
import re
import string
from sklearn import preprocessing
from google.colab import drive
from matplotlib import style
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.combine import SMOTETomek
from sklearn.ensemble import RandomForestClassifier

### Masukkan Data Hoax dan Fakta

In [None]:
# Mount Google Drive under /content/drive so files stored there can be read
drive.mount('/content/drive')

In [None]:
# Load the news dataset from the Excel file on the mounted Google Drive
df = pd.read_excel("/content/drive/MyDrive/Data Berita Hoax/berita.xlsx")

In [None]:
# Preview the first 10 rows of the dataset
df.head(10)

In [None]:
#Case Folding
def wordopt(text):
    """Normalise one raw news text before it is tokenised and vectorised.

    Steps, in order: lowercase the text; delete ``[...]`` spans, URLs and
    ``<...>`` tags; delete digits; replace every remaining non-word character
    with a space; delete underscores (the only ``string.punctuation``
    character that ``\\W`` does not cover).

    URLs and tags must be removed *before* the non-word characters are
    blanked out: once ``:``, ``/``, ``.``, ``<`` and ``>`` have become
    spaces, the URL and tag patterns can no longer match and their fragments
    ("https", "com", tag names) would stay in the text as ordinary words.

    Args:
        text: The raw text as a ``str``.

    Returns:
        The cleaned, lower-cased text. Runs of spaces are not collapsed.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'\d+', '', text)
    # Newlines are non-word characters too, so they become spaces here.
    text = re.sub(r'\W', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    return text

# Clean every text in the "berita" column, then preview the result
df["berita"] = df["berita"].apply(wordopt)
df.head(10)

In [None]:
pip install Sastrawi

In [None]:
# Get the Indonesian stop-word list from the Sastrawi module and print it
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
factory = StopWordRemoverFactory()
stopwords = factory.get_stop_words()
print(stopwords)

In [None]:
#Tokenizing
def tokenizing(text):
    """Split a text on whitespace and return the resulting list of tokens."""
    return text.split()

# Tokenise every text in the "berita" column, then preview the result
df["berita"] = df["berita"].apply(tokenizing)
df.head(10)

In [None]:
# Filtering: create the stop-word remover from Sastrawi's factory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()

def wordopt(text):
    """Delete the extra filter words "tersebut" and "tidak" from one document.

    Accepts either a plain string or a sequence of string tokens. A token
    sequence is joined with single spaces first, because ``re.sub`` only
    works on strings and would raise ``TypeError`` on a list.

    NOTE: this definition reuses the name ``wordopt`` and therefore replaces
    the text-cleaning function of the same name defined earlier.
    NOTE(review): the patterns are not word-bounded, so they also match
    inside longer words, and 'tersebut*' matches "tersebu" followed by any
    number of "t" - confirm that this is intended.

    Args:
        text: A ``str`` or an iterable of ``str`` tokens.

    Returns:
        The document as a single string with the filter words deleted.
    """
    if not isinstance(text, str):
        text = ' '.join(text)
    text = re.sub('tersebut*', '', text)
    text = re.sub('tidak', '', text)
    return text

# Apply the custom word filter first, then the Sastrawi stop-word remover
df["berita"] = df["berita"].apply(wordopt)
df["berita"] = df["berita"].apply(stopword.remove)
df.head(10)

In [None]:
# Remove duplicate news texts, if there are any
bool_series = df.duplicated(subset='berita')

print('Boolean series:')
print(bool_series)
print('\n')
print("DataFrame after removing duplicates found in the 'berita' column:")
# Store the filtered frame back in `df`. Only displaying `df[~bool_series]`
# leaves the duplicates in the data that the following cells work on.
df = df[~bool_series].reset_index(drop=True)
df

In [None]:
# Convert the labels to numbers: Valid (fact) = 0, Hoax = 1
# The result is assigned back to the column. Calling replace(inplace=True) on
# `df['tagging']` is chained assignment on a column selection, which pandas
# warns about and which does not update `df` under Copy-on-Write.
# infer_objects() gives the column a numeric dtype once every label is replaced.
df['tagging'] = df['tagging'].replace(['Valid', 'Hoax'], [0, 1]).infer_objects()
df.head(10)

In [None]:
# Check the number of hoax news items: select the rows with tagging == 1 and display them
df_hoax = df[df["tagging"] == 1]
df_hoax

In [None]:
# Word cloud of the news texts: all rows of "berita" are joined into one string
from wordcloud import WordCloud
all_word = ' '.join(df["berita"])
wordcloud = WordCloud(width = 800, height = 500, max_font_size = 110, collocations = False).generate(all_word)
plt.figure(figsize=(10,7))
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis("off")
plt.title("Berita")
plt.show()

In [None]:
# Define x (the news texts) and y (the labels)
x = df["berita"]
y = df["tagging"]

In [None]:
# TF-IDF: fit the vectorizer on the news texts and transform them
vectorization = TfidfVectorizer()
xt = vectorization.fit_transform(x)

In [None]:
# Normalise the data: convert the TF-IDF matrix to a dense array and pass it
# through sklearn's preprocessing.normalize (default settings), then show a slice
xt1 = preprocessing.normalize(xt.toarray())
xt1[5:15,100:110]

In [None]:
# Check which column each keyword was assigned to by the vectorizer
vocabulary = vectorization.vocabulary_
print(vocabulary)

In [None]:
# Balance the classes with SMOTETomek
# sampling_strategy is passed by keyword: the constructor arguments are
# keyword-only in current imbalanced-learn releases, so SMOTETomek(1) fails there.
smote = SMOTETomek(sampling_strategy=1.0)

# Apply the resampling
# NOTE(review): the data is resampled before the train/test split, so
# synthetic samples may be derived from rows that later land in the test
# set - consider splitting first and resampling only the training part.
x_sm, y_sm = smote.fit_resample(xt1, y)

y_sm.value_counts()

In [None]:
# Bar chart of the class balance after resampling
style.use('ggplot')

# Take the bar heights from the resampled labels instead of hard-coding them,
# and use dedicated names so the feature/label variables `x` and `y` are not
# overwritten with plot data.
label_counts = y_sm.value_counts()
bar_positions = [0, 1]
bar_heights = [int(label_counts.get(label, 0)) for label in bar_positions]

fig, ax = plt.subplots()

ax.bar(bar_positions, bar_heights, align='center', color = 'red')

ax.set_title('Data Berita')
ax.set_ylabel('Banyak Berita')
ax.set_xlabel('Label')

ax.set_xticks(bar_positions)
ax.set_xticklabels(("Fakta", "Hoaks"))

plt.show()

In [None]:
# Split the resampled dataset into training and test parts (test_size=0.2)
x_train, x_test, y_train, y_test = train_test_split(x_sm, y_sm, test_size=0.2)

## Random Forest Classifier

In [None]:
# Modelling: fit a random forest classifier (Gini criterion) on the training data
RFC = RandomForestClassifier(criterion = 'gini')
RFC.fit(x_train, y_train)

In [None]:
# Prediction results for the test data
pred_rfc = RFC.predict(x_test)
pred_rfc

In [None]:
# Check the accuracy on the test data
RFC.score(x_test, y_test)

In [None]:
# Check precision, recall and f1-score
print(classification_report(y_test, pred_rfc))

In [None]:
#Membuat confusion matrix
from sklearn.metrics import classification_report, confusion_matrix
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    cm        -- the confusion matrix (indexed as cm[row, column]).
    classes   -- tick labels, used for both axes.
    normalize -- when true, each row is divided by its row sum before
                 printing and plotting, and cells are shown with 2 decimals.
    title     -- title of the plot.
    cmap      -- matplotlib colour map passed to ``plt.imshow``.

    Draws on the current matplotlib figure and returns nothing.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    cell_format = '.2f' if normalize else 'd'
    half_max = cm.max() / 2.
    # Annotate every cell; cells above half of the maximum get white text.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            cell_text = format(cm[row, col], cell_format)
            text_color = "white" if cm[row, col] > half_max else "black"
            plt.text(col, row, cell_text,
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# Print the raw confusion matrix for the test predictions, with labels=[1,0]
print(confusion_matrix(y_test, pred_rfc, labels=[1,0]))

In [None]:
# Draw the confusion matrix figure
cnf_matrix = confusion_matrix(y_test, pred_rfc, labels=[1,0])
plt.figure()
plot_confusion_matrix(cnf_matrix,classes=['Hoax = 1', 'Fakta = 0'],normalize = False, title='Confusion matrix')

In [None]:
#Fungsi untuk manual testing
def output_lable(n):
    """Map a numeric class label to its display name.

    Returns "Fakta" for 0, "Hoaks" for 1 and None for any other value.
    """
    if n == 0:
        return "Fakta"
    return "Hoaks" if n == 1 else None
    
def manual_testing(news):
    """Classify one news text with the random forest and print the result.

    The text is wrapped in a one-row DataFrame, transformed with the
    module-level `vectorization` object and passed to `RFC.predict`; the
    first prediction is converted to its display name with `output_lable`.

    Returns None (the return value of `print`).
    """
    single_row = pd.DataFrame({"berita": [news]})
    features = vectorization.transform(single_row["berita"])
    predicted = RFC.predict(features)
    label = output_lable(predicted[0])

    return print("\nRFC Prediction: {} ".format(label))

In [None]:
# Manual testing: read one news text, clean it, and classify it
news = input()
news = news.lower()
news = re.sub(r'\[.*?\]', '', news)
# URLs and <...> tags are removed before the non-word characters are blanked
# out; after that step the ':', '/', '.', '<' and '>' they rely on are gone
# and the patterns could never match.
news = re.sub(r'https?://\S+|www\.\S+', '', news)
news = re.sub(r'<.*?>+', '', news)
news = re.sub(r'\d+', '', news)
# Newlines are non-word characters too, so they become spaces here.
news = re.sub(r'\W', ' ', news)
news = re.sub('[%s]' % re.escape(string.punctuation), '', news)

print(news)
manual_testing(news)

dokter hewan dari lab balai uji standar karantina ikan  bkipm kementerian kelautan dan perikanan  drh m aji purbayu mengatakan bahwa bintik putih pada ikan lele tersebut bukan mengindikasikan ada cacing di dalamnya  melainkan parasit jenis protozoa  aji mengungkapkan  cysta parasit protozoa pada ikan lele tersebut berjenis ichtyophthirius multifilis atau dikenal sebagai parasit penyebab penyakit white spot pada ikan  parasit tersebut tidak bersifat zoonosis  tidak menular ke manusia  dan akan mati pada pemanasan atau pemasakan ikan hingga matang  jika menemukan ikan dengan kondisi seperti di atas  aji mengatakan  masih bisa dikonsumsi  dengan catatan  diolah atau dimasak dengan benar benar matang 

RFC Prediction: Fakta 
