In [None]:
# load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw dataset into a DataFrame and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where that exact file exists -- consider a path relative to the notebook.
DATA_PATH = r"C:\Users\User\Desktop\fakenews_dataset20210819.csv"
data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes, memory use)
data.info()

In [None]:
# Number of missing entries in each column
data.isna().sum()

In [None]:
# Relative frequency of each class in the 'target' column
data['target'].value_counts(normalize=True)

The dataset is imbalanced: most of the rows carry label 0.

In [None]:
# Discard the 'id' and 'news_url' columns in a single call
data = data.drop(columns=['id', 'news_url'])

# Data Exploration

In [None]:
def boxplot(column):
    """Draw a box plot of one column of ``data``, grouped by ``target``.

    Parameters
    ----------
    column : str
        Name of the column of the module-level ``data`` frame to put on the
        y-axis.

    Returns
    -------
    matplotlib.figure.Figure
        The newly created figure holding the plot, so that the caller can
        save or close it explicitly.
    """
    # A dedicated figure/axes pair keeps every plot independent of pyplot's
    # implicit "current figure" state.
    fig, ax = plt.subplots()
    sns.boxplot(x='target', y=column, data=data, ax=ax)
    return fig

In [None]:
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

In [None]:
# Save one box plot per feature column to a multi-page PDF
with PdfPages('boxplot.pdf') as pdf:
    for column in data.columns:
        # Plotting 'target' against itself carries no information.
        if column == 'target':
            continue
        boxplot(column)
        # The figure boxplot() just drew on is pyplot's current figure; take
        # an explicit handle so the same object is saved and then closed.
        fig = plt.gcf()
        pdf.savefig(fig)
        # Close each figure once written so open figures do not pile up.
        plt.close(fig)


In [None]:
# Remove outliers: keep only the rows that satisfy every bound below.
within_bounds = (
    (data["avg_sentence_length"] < 5)
    & (data["arousal"] < 5.5)
    & (data["dominance"] < 5.5)
    & (data["polarity"] > -0.75)
)
# .copy() yields an independent frame, so adding columns to df_filtered later
# does not raise pandas' SettingWithCopyWarning.
df_filtered = data.loc[within_bounds].copy()

In [None]:
# (rows, columns) of the frame left after the outlier filter
df_filtered.shape

# Feature Engineering

In [None]:
# Derive one "<part of speech>_perc" column per density column by dividing the
# density by the average sentence length.
PARTS_OF_SPEECH = ['noun', 'verb', 'adverb', 'adjective', 'preposition', 'conjunction']

# NOTE(review): a row with avg_sentence_length == 0 would yield inf/NaN here --
# confirm no such rows exist before scaling.
# .assign returns a new frame instead of writing into what may be a slice of
# `data`, which avoids SettingWithCopyWarning and keeps the cell re-runnable.
df_filtered = df_filtered.assign(**{
    pos + '_perc': df_filtered[pos + '_density'] / df_filtered['avg_sentence_length']
    for pos in PARTS_OF_SPEECH
})

In [None]:
# Preview the frame, including the newly added *_perc columns
df_filtered.head()

# Data Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Rescale every column of the filtered frame into the [0, 1] range.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split made later in the notebook, so test rows influence the scaling; the
# 'target' column is scaled together with the features.
df = df_filtered
scaler = MinMaxScaler(feature_range=(0, 1))
# Fit and transform in one step
normalised = scaler.fit_transform(df)

# Wrap the resulting array back into a frame with the original labels
df_scaled = pd.DataFrame(normalised, index=df.index, columns=df.columns)

In [None]:
# Preview the rescaled frame
df_scaled.head()

# Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# The feature names and their scores are concatenated into one frame further below for easier reading.


In [None]:
# Separate the label column from the predictors
y = df_scaled['target']
X = df_scaled.drop(columns=['target'])

In [None]:
# Score every feature against the label with the chi-squared statistic.
# The selector is configured for the 10 best features (k=10); this cell only
# collects the per-feature scores and the matching column names.
bestfeatures = SelectKBest(chi2, k=10)
fit = bestfeatures.fit(X, y)
dfcolumns = pd.DataFrame(X.columns)
dfscores = pd.DataFrame(fit.scores_)

In [None]:
# Put feature names and chi-squared scores side by side in one table
featureScores = pd.concat([dfcolumns, dfscores], axis="columns")
featureScores.columns = ['Featured', 'Score']

# Show the ten highest-scoring features
top_features = featureScores.nlargest(10, 'Score')
print(top_features)

In [None]:
# Remove three hand-picked features from the predictor matrix
X = X.drop(columns=["conjunction_density", "binned_interjection_density", "title_words"])

In [None]:
# Keep the remaining feature names as a NumPy array for labelling plots later
X_columns = X.columns.to_numpy()

# Modeling

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import GridSearchCV
from sklearn import metrics 
from sklearn import tree

In [None]:
# Hold out 20% of the rows for testing.
# random_state makes the split (and every score derived from it) reproducible
# across kernel restarts; stratify=y keeps the class ratio of the imbalanced
# label the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Hyperparameter tuning for the decision tree
dt = DecisionTreeClassifier(random_state=42)

# Candidate values explored exhaustively by the grid search
params = dict(
    max_depth=[2, 3, 5, 10, 20],
    min_samples_leaf=[5, 10, 20, 50, 100],
    criterion=["gini", "entropy"],
    splitter=["best", "random"],
)

# 4-fold cross-validated search, ranked by accuracy, using all CPU cores
grid_search = GridSearchCV(
    dt,
    params,
    scoring="accuracy",
    cv=4,
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(X_train, y_train)

In [None]:
# Keep the best decision tree found by the search and display it.
# (Only the last expression of a cell is rendered, so the estimator is shown
# after the assignment rather than before it.)
dt_best = grid_search.best_estimator_
dt_best

In [None]:
# Predict the held-out rows with the tuned decision tree
y_pred_dt = dt_best.predict(X_test)

# Share of test rows the tree classifies correctly
dt_accuracy = metrics.accuracy_score(y_test, y_pred_dt)
print("Accuracy:", dt_accuracy)

In [None]:
# Draw the tuned decision tree on a large canvas so the nodes stay readable
fig, ax = plt.subplots(figsize=(25, 20))
# feature_names is handed over as a plain list: a list is accepted by every
# scikit-learn release, whereas some releases reject other array types here.
_ = tree.plot_tree(dt_best,
                   feature_names=list(X_columns),
                   filled=True,
                   ax=ax)

In [111]:
# Tune a class-weighted logistic regression with a cross-validated grid search.
lr = LogisticRegression(random_state=42)

# Settings shared by every sub-grid
common_lr = {
    'C': np.logspace(-4, 4, 20),
    'class_weight': ['balanced'],
    'max_iter': [100, 500, 1000],
}

# Only penalty/solver pairs that LogisticRegression actually supports are
# listed. A full cross product of penalties and solvers contains pairs that
# cannot be fitted (e.g. 'l1' with 'lbfgs', or 'elasticnet' without an
# l1_ratio); they only produce failed fits and warnings.
# The unpenalised option is left out because its spelling differs between
# scikit-learn versions ('none' vs None); the largest C values already
# approximate it.
params_lr = [
    {**common_lr, 'penalty': ['l2'], 'solver': ['lbfgs', 'newton-cg', 'sag']},
    {**common_lr, 'penalty': ['l1', 'l2'], 'solver': ['liblinear', 'saga']},
]

# Instantiate the grid search model
grid_search_lr = GridSearchCV(estimator=lr,
                              param_grid=params_lr,
                              cv=4,
                              verbose=True,
                              n_jobs=-1, scoring="accuracy")

grid_search_lr.fit(X_train, y_train)

In [None]:
# Keep the best logistic regression found by its own search and display it.
# It must come from grid_search_lr; grid_search holds the decision-tree search.
lr_best = grid_search_lr.best_estimator_
lr_best

In [None]:
# Predict the held-out rows with lr_best
y_pred_lr = lr_best.predict(X_test)

# Model accuracy: score the predictions made in this cell (y_pred_lr), not the
# decision-tree predictions from the earlier cell.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_lr))