In [None]:
!pip install simpletransformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive at /content/gdrive; the dataset path used by this notebook
# lives under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')


Mounted at /content/gdrive


In [None]:
import pandas as pd
import numpy as np

# Model Training
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

# Linear models (LinearRegression is a regressor; LogisticRegression is a classifier)
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

# Classification 
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


from sklearn.ensemble import RandomForestClassifier

# Clustering (KMeans) and nearest-neighbour classification (KNeighborsClassifier)
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Location of the dataset CSV on the mounted Google Drive (read with pd.read_csv).
path_dataset = '/content/gdrive/MyDrive/ai-playground/ra-text-classification-spectrogram/datasets/isot-dataset-cleaned-4000.csv'

In [None]:
# Load the dataset; only the 'text' and 'category' columns are used below.
# NOTE(review): the rendered frame also shows several 'Unnamed: ...' columns,
# presumably index columns left over from earlier CSV exports - confirm and drop at the source.
df = pd.read_csv(path_dataset)

# Split rows by label: category 0 -> fake news, category 1 -> true news.
df_fake_news, df_true_news = (df.loc[df['category'] == label] for label in (0, 1))

# Plain Python lists of article text, one list per class.
df_fake_news_list = df_fake_news['text'].tolist()
df_true_news_list = df_true_news['text'].tolist()

In [None]:
# Preview the loaded frame (the rendered output elides the middle rows).
df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,text,category
0,21,21,NEW YORK (Reuters) U.S. Justice Department iss...,1
1,34,34,"KING PRUSSIA, Pennsylvania/WASHINGTON (Reuters...",1
2,51,51,WASHINGTON (Reuters) Republican-controlled U.S...,1
3,66,66,ATLANTA (Reuters) two Democratic candidates ru...,1
4,71,71,(Reuters) U.S. House Representatives gave fina...,1
...,...,...,...,...
2758,44877,23460,Patrick Henningsen 21st Century WireThere exis...,0
2759,44880,23463,Patrick Henningsen 21st Century Wire UPDATE: 1...,0
2760,44889,23472,Dady Chery Gilbert MercierAll writers desire r...,0
2761,44891,23474,Paul Craig RobertsIn last years 20th century f...,0


In [None]:
import re

# Removal of square-bracketed segments (despite the name, ordinary punctuation is untouched here)
def remove_punctuations(text):
    """Delete every ``[...]`` span, brackets included, from ``text``.

    Args:
        text: input string.

    Returns:
        ``text`` with each run of '[' + any characters other than ']' + ']' removed.
        Text without brackets is returned unchanged.
    """
    # Raw string: '\[' and '\]' are regex escapes, not Python string escapes.
    return re.sub(r'\[[^]]*\]', '', text)

# Replace everything that is not an ASCII letter
def remove_characters(text):
    """Return ``text`` with each character outside a-z / A-Z replaced by one space.

    Digits, punctuation, whitespace and non-ASCII letters are all replaced;
    the length of the string is preserved.
    """
    non_letter = re.compile("[^a-zA-Z]")
    return non_letter.sub(" ", text)

#Removal of stopwords and lemmatization
def remove_stopwords_and_lemmatization(text):
    """Lower-case ``text``, drop English stop words and lemmatize the remaining tokens.

    Args:
        text: input string.

    Returns:
        The surviving lemmatized tokens joined by single spaces.

    Note:
        Relies on NLTK's tokenizer models, stop-word corpus and WordNet data
        being available locally (they are fetched with ``nltk.download``).
    """
    # Local imports: NLTK is not imported by the notebook's top-level import cell.
    import nltk
    from nltk.corpus import stopwords

    # Build the stop-word set and the lemmatizer once per call, not once per token.
    english_stopwords = set(stopwords.words('english'))
    lemmatizer = nltk.WordNetLemmatizer()

    tokens = nltk.word_tokenize(text.lower())
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    )

# Full cleaning pipeline
def cleaning(text):
    """Apply every cleaning step to ``text`` in order and return the result.

    Order: remove_punctuations -> remove_characters ->
    remove_stopwords_and_lemmatization.
    """
    pipeline = (
        remove_punctuations,
        remove_characters,
        remove_stopwords_and_lemmatization,
    )
    for step in pipeline:
        text = step(text)
    return text

#Apply function on text column
# NOTE(review): df_fake_news_list / df_true_news_list are extracted from df['text']
# in an earlier cell, before this line runs, so the cleaned text never reaches
# those lists - confirm whether the embeddings are meant to use raw or cleaned text.
# This overwrites df['text'] in place; re-running the cell cleans already-cleaned text.
df['text']=df['text'].apply(cleaning)


NameError: ignored

In [None]:
from simpletransformers.language_representation import RepresentationModel
import torch

# Pre-trained BERT encoder used to turn each article into a fixed-size vector.
model = RepresentationModel(
    model_type='bert',
    model_name='bert-base-uncased',
    # Use the GPU when one is present; otherwise run on CPU so the cell
    # still works on runtimes without CUDA.
    use_cuda=torch.cuda.is_available(),
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTextRepresentation: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTextRepresentation from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTextRepresentation from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Embed every article with the BERT representation model, one call per class.
# combine_strategy='mean' asks the library to pool the token vectors of a text
# into a single vector (presumably one row per article - confirm the output shape).
fake_news_vectors = model.encode_sentences(df_fake_news_list, combine_strategy='mean')
true_news_vectors = model.encode_sentences(df_true_news_list, combine_strategy='mean')

In [None]:
# One integer label per embedded article: 0 for fake news, 1 for true news.
fake_news_category = len(fake_news_vectors) * [0]
true_news_category = len(true_news_vectors) * [1]

In [None]:
import numpy as np

# Stack the fake-news embeddings followed by the true-news embeddings along axis 0.
news_vectors = np.concatenate((fake_news_vectors, true_news_vectors), axis=0)
# Labels in the same fake-then-true order, as one flat array.
news_categories = np.concatenate((fake_news_category, true_news_category))

In [None]:
from sklearn.model_selection import train_test_split

# Hold-out configuration: 80 % of the samples for training, 20 % for testing.
ratio_train = 0.8
ratio_test = 0.2
assert abs(ratio_train + ratio_test - 1.0) < 1e-9, "train/test ratios must sum to 1"

# Fixed seed so the shuffled split (and every metric computed from it) is reproducible.
split_seed = 42

# test_size is passed directly instead of 1 - ratio_train, which evaluates to
# 0.19999999999999996 in floating point.
X_train, X_test, y_train, y_test = train_test_split(
    news_vectors,
    news_categories,
    test_size=ratio_test,
    shuffle=True,
    random_state=split_seed,
)

In [None]:
def getPerformanceMetrics(test_y, pred_y):
  """Score predictions against the ground-truth labels.

  Args:
    test_y: true labels.
    pred_y: predicted labels, in the same order as test_y.

  Returns:
    Tuple (accuracy, recall, precision, f1), each produced by the
    scikit-learn scorer of that name called with its default arguments.
  """
  scorers = (accuracy_score, recall_score, precision_score, f1_score)
  return tuple(scorer(test_y, pred_y) for scorer in scorers)

In [None]:
# Support-vector classifier with scikit-learn's default hyper-parameters.
model_svm = SVC().fit(X_train, y_train)
y_pred_svm = model_svm.predict(X_test)

# Result tuple order: (accuracy, recall, precision, F1).
perfmetrics_svm = getPerformanceMetrics(y_test, y_pred_svm)
perfmetrics_svm

(0.972875226039783, 0.9650793650793651, 0.987012987012987, 0.9759229534510434)

In [None]:
# Logistic regression; the solver is allowed up to 200 iterations.
model_logreg = LogisticRegression(max_iter=200).fit(X_train, y_train)
y_pred_logreg = model_logreg.predict(X_test)

# Result tuple order: (accuracy, recall, precision, F1).
perfmetrics_logreg = getPerformanceMetrics(y_test, y_pred_logreg)
perfmetrics_logreg

(0.9783001808318263, 0.9682539682539683, 0.993485342019544, 0.9807073954983923)

In [None]:
# Decision tree with default hyper-parameters.
# random_state is fixed because tree construction is randomized; without it
# repeated runs can produce different trees and different scores.
model_dt = DecisionTreeClassifier(random_state=42)
model_dt.fit(X_train, y_train)
y_pred_dt = model_dt.predict(X_test)

# Result tuple order: (accuracy, recall, precision, F1).
perfmetrics_dt = getPerformanceMetrics(y_test, y_pred_dt)
perfmetrics_dt

(0.8752260397830018,
 0.8507936507936508,
 0.9241379310344827,
 0.8859504132231405)

In [None]:
# k = number of distinct training labels + 1 (3 when both labels 0 and 1 are present).
n_neighbours = np.unique(y_train).size + 1
model_knn = KNeighborsClassifier(n_neighbors=n_neighbours).fit(X_train, y_train)
y_pred_knn = model_knn.predict(X_test)

# Result tuple order: (accuracy, recall, precision, F1).
perfmetrics_knn = getPerformanceMetrics(y_test, y_pred_knn)
perfmetrics_knn

(0.9240506329113924, 0.9492063492063492, 0.92, 0.934375)

In [None]:
# Random forest trained on all available cores (n_jobs=-1).
# random_state is fixed because bootstrapping and feature sampling are random;
# without it the reported scores change from run to run.
model_rf = RandomForestClassifier(n_jobs=-1, random_state=42)
model_rf.fit(X_train, y_train)
pred_y_rf = model_rf.predict(X_test)

# Result tuple order: (accuracy, recall, precision, F1).
perfmetrics_rf = getPerformanceMetrics(y_test, pred_y_rf)
perfmetrics_rf

(0.9783001808318263,
 0.9777777777777777,
 0.9840255591054313,
 0.9808917197452229)

In [None]:
model_cnn = Sequential()
