

## Library import
We import all the required Python libraries

In [2]:
# Data manipulation
import pandas as pd
import numpy as np
#import xgboost as xgb
#from catboost import Pool,CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
# Options for pandas
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30

# showing multiple outputs 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from pandarallel import pandarallel

# Initialization
pandarallel.initialize(progress_bar=True)

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, matthews_corrcoef

#import lazypredict
#from lazypredict.Supervised import LazyClassifier


INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
#!pip3 install pandarallel
#!pip3 list

## Local library import
We import all the required local libraries libraries

In [4]:
# Include local library paths
import sys
# sys.path.append('path/to/local/lib') # uncomment and fill to import local libraries

# Import local libraries

# Parameter definition
We set all relevant parameters for our notebook. By convention, parameters are uppercase, while all the 
other variables follow Python's guidelines.


# Data import
We retrieve all the required data for the analysis.

In [5]:
deceptive_opinion = pd.read_csv('deceptive-opinion.csv')
Yelp_review_sentiments = pd.read_excel('Yelp Labelled Review Dataset with Sentiments and Features.xlsx',engine='openpyxl')

# Data processing
Put here the core of the notebook. Feel free di further split this section into subsections.

In [6]:
Yelp_review_sentiments['deceptive'] = Yelp_review_sentiments['Spam(1) and Not Spam(0)'].map({1: 'deceptive', 0: 'truthful'})
Yelp_review_sentiments['text'] = Yelp_review_sentiments["Review"]

In [7]:
deceptive_opinion
Yelp_review_sentiments

Unnamed: 0,deceptive,hotel,polarity,source,text
0,truthful,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...
1,truthful,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...
2,truthful,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...
3,truthful,omni,positive,TripAdvisor,The Omni Chicago really delivers on all fronts...
4,truthful,hyatt,positive,TripAdvisor,I asked for a high floor away from the elevato...
...,...,...,...,...,...
1595,deceptive,intercontinental,negative,MTurk,Problems started when I booked the InterContin...
1596,deceptive,amalfi,negative,MTurk,The Amalfi Hotel has a beautiful website and i...
1597,deceptive,intercontinental,negative,MTurk,The Intercontinental Chicago Magnificent Mile ...
1598,deceptive,palmer,negative,MTurk,"The Palmer House Hilton, while it looks good i..."


Unnamed: 0,User_id,Product_id,Rating,Date,Review,Spam(1) and Not Spam(0),Sentiment,Features,deceptive,text
0,923,0,3,2014-01-30,The food at snack is a selection of popular Gr...,1,Positive,"['appetizer tray', 'greek salad', 'main courses']",deceptive,The food at snack is a selection of popular Gr...
1,924,0,3,2011-05-05,This little place in Soho is wonderful. I had ...,1,Positive,"['little place', 'soho', 'lamb sandwich', 'soh...",deceptive,This little place in Soho is wonderful. I had ...
2,925,0,4,2011-12-30,ordered lunch for 15 from Snack last Friday. Ã...,1,Positive,"['snack', 'regular company lunch list']",deceptive,ordered lunch for 15 from Snack last Friday. Ã...
3,926,0,4,2012-10-04,This is a beautiful quaint little restaurant o...,1,Positive,"['beautiful quaint', 'pretty street', 'great p...",deceptive,This is a beautiful quaint little restaurant o...
4,927,0,4,2014-02-06,Snack is great place for a Ã‚Â casual sit down...,1,Positive,"['snack', 'great place', 'Ã¢ casual', 'cold wi...",deceptive,Snack is great place for a Ã‚Â casual sit down...
...,...,...,...,...,...,...,...,...,...,...
355205,161146,349,1,2012-10-04,The aircondition makes so much noise and its ...,0,Negative,[],truthful,The aircondition makes so much noise and its ...
355206,116424,349,1,2013-05-27,Even though the pictures show very clean room...,0,Negative,"['clean rooms', 'actual room', 'o clock']",truthful,Even though the pictures show very clean room...
355207,161147,349,2,2011-03-03,Backyard of the hotel is total mess shouldn t...,0,Negative,"['backyard', 'total mess shouldn t']",truthful,Backyard of the hotel is total mess shouldn t...
355208,97930,349,2,2014-07-29,You When I booked with your company on line y...,0,Negative,"['s room', 'villa suite theough', 'wife s 40th...",truthful,You When I booked with your company on line y...


In [8]:
#filter deceptive opinion dataset with just text and label
sub_deceptive_opinion = deceptive_opinion[["deceptive", "text"]]

In [9]:
#filter Yelp dataset with just text and label
sub_Yelp_review_sentiments = Yelp_review_sentiments[["deceptive", "text"]]

In [10]:

sub_Yelp_deceptive = sub_Yelp_review_sentiments[sub_Yelp_review_sentiments["deceptive"] == "deceptive"].reset_index(drop = True)
sub_Yelp_truthful = sub_Yelp_review_sentiments[sub_Yelp_review_sentiments["deceptive"] == "truthful"].reset_index(drop = True).iloc[:36133,:]

In [11]:
#sub dataset of Yelp with around 36100 observations - balance class
concat_data = pd.concat([sub_deceptive_opinion, sub_Yelp_deceptive,sub_Yelp_truthful],ignore_index=True)

sub_deceptive_opinion = deceptive_opinion[["deceptive", "text"]]
sub_Yelp_review_sentiments = Yelp_review_sentiments[["deceptive", "text"]]
concat_data = pd.concat([sub_deceptive_opinion, sub_Yelp_review_sentiments],ignore_index=True)

In [12]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import re
import string

def clean_text(text):
    '''
    Make text lowercase, remove text in square brackets,remove links,remove special characters
    and remove words containing numbers.
    '''
    #print(text)
    if type(text) is str:
        text = text.lower()
        text = re.sub('\[.*?\]', '', text)
        text = re.sub("\\W"," ",text) # remove special chars
        text = re.sub('https?://\S+|www\.\S+', '', text)
        text = re.sub('<.*?>+', '', text)
        text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
        text = re.sub('\n', '', text)
        text = re.sub('\w*\d\w*', '', text)
    else:
        text = str(text)
    
    return text
    

In [13]:
import string
#nltk.download('stopwords')
from nltk.corpus import stopwords

def remove_stopwords(text):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    STOPWORDS = stopwords.words('english') + ['u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure']
    # Check characters to see if they are in punctuation
    nopunc = [char for char in text if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # Now just remove any stopwords
    return ' '.join([word for word in nopunc.split() if word.lower() not in STOPWORDS])

In [14]:
stemmer = nltk.SnowballStemmer("english")

def stemm_text(text):
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    return text

In [15]:
def preprocess_data(text):
    # Clean puntuation, urls, and so on
    text = clean_text(text)
    # Remove stopwords
    text = remove_stopwords(text)
    # Stemm all the words in the sentence
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    
    return text

In [16]:
concat_data['cleaned_text']=concat_data['text'].parallel_apply(preprocess_data)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3078), Label(value='0 / 3078'))), …

In [17]:
concat_data

Unnamed: 0,deceptive,text,cleaned_text
0,truthful,We stayed for a one night getaway with family ...,stay one night getaway famili thursday tripl a...
1,truthful,Triple A rate with upgrade to view room was le...,tripl rate upgrad view room less also includ b...
2,truthful,This comes a little late as I'm finally catchi...,come littl late final catch review past sever ...
3,truthful,The Omni Chicago really delivers on all fronts...,omni chicago realli deliv front spacious room ...
4,truthful,I asked for a high floor away from the elevato...,ask high floor away elev got room pleasant dec...
...,...,...,...
73861,truthful,Excellent food and awesome 5 star service from...,excel food awesom star servic moment walk leav...
73862,truthful,YUMMY. Although I haven't had ramen from many ...,yummi although ramen mani differ place probabl...
73863,truthful,The soup is flavorful and deeply taste behind ...,soup flavor deepli tast behind love system ser...
73864,truthful,It was a snowy midweek afternoon that we decid...,snowi midweek afternoon decid drop bowl ramen ...


In [18]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

x=concat_data['cleaned_text']
y=concat_data['deceptive']
le=LabelEncoder()
y=le.fit_transform(y)



In [19]:
X_train, X_test, Y_train, Y_test = train_test_split(x,y,test_size=.10,random_state =23)

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(stop_words='english', ngram_range=(1,2),max_features=60000) #60000 reached 79.6
X_train=cv.fit_transform(X_train)
X_test=cv.transform(X_test)

In [21]:
Y_train.shape
X_train.shape
X_test.shape

(66479,)

(66479, 60000)

(7387, 60000)

In [22]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train)
tfidf_transformer.transform(X_train)

TfidfTransformer()

<66479x60000 sparse matrix of type '<class 'numpy.float64'>'
	with 3759789 stored elements in Compressed Sparse Row format>

In [24]:
from xgboost import XGBClassifier
#from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

**Comment:** the hyperparameters have been used in the XGBClassifier below are optimized by using RandomizedSearchCV 

In [25]:
xgb_clf = XGBClassifier(random_state=24,reg_lambda = 1.2, reg_alpha = 1.1, n_estimators = 400,max_depth = 20, colsample_bytree = 0.7)
xgb_clf.fit(X_train, Y_train)
# Do the prediction
y_predict =xgb_clf.predict(X_test)
print(confusion_matrix(Y_test,y_predict))
recall=recall_score(Y_test,y_predict,average='macro')
precision=precision_score(Y_test,y_predict,average='macro')
f1score=f1_score(Y_test,y_predict,average='macro')
accuracy=accuracy_score(Y_test,y_predict)
#atthews = matthews_corrcoef(y_test,y_predict) 
print('Accuracy: '+ str(accuracy))
print('Macro Precision: '+ str(precision))
print('Macro Recall: '+ str(recall))
print('Macro F1 score:'+ str(f1score))
#print('MCC:'+ str(matthews))





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=20,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=400, n_jobs=24, num_parallel_tree=1, random_state=24,
              reg_alpha=1.1, reg_lambda=1.2, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

[[2998  724]
 [ 800 2865]]
Accuracy: 0.7936916204142412
Macro Precision: 0.7938176609208025
Macro Recall: 0.7935999436996788
Macro F1 score:0.793624720623691


# Extra code for References

In [87]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, matthews_corrcoef

def model_testing(x_test, y_test, y_train, x_train):
    models = [
        #LogisticRegression(max_iter = 10000),
        RandomForestClassifier(random_state=24),
        XGBClassifier(random_state=24,reg_lambda = 1.2, reg_alpha = 1.1, n_estimators = 400,max_depth = 20, colsample_bytree = 0.7),
        AdaBoostClassifier( DecisionTreeClassifier(max_depth=1), n_estimators=200, algorithm="SAMME.R", learning_rate=0.5),
        #KNeighborsClassifier(),
        #LGBMClassifier(random_state=24)
        #SVC(),
        #MultinomialNB(),
        #CatBoostClassifier(iterations=50, task_type="GPU",boosting_type = Plain,learning_rate=0.05, l2_leaf_reg=1, depth=5, loss_function= 'Logloss', eval_metric='AUC',random_seed=42,verbose=False)
        ]

    for clf in models:
        model_name = clf.__class__.__name__
        clf.fit(x_train, y_train)
        print(model_name)
        # Do the prediction
        y_predict =clf.predict(X_test)
        print(confusion_matrix(y_test,y_predict))
        recall=recall_score(y_test,y_predict,average='macro')
        precision=precision_score(y_test,y_predict,average='macro')
        f1score=f1_score(y_test,y_predict,average='macro')
        accuracy=accuracy_score(y_test,y_predict)
        matthews = matthews_corrcoef(y_test,y_predict) 
        print('Accuracy: '+ str(accuracy))
        print('Macro Precision: '+ str(precision))
        print('Macro Recall: '+ str(recall))
        print('Macro F1 score:'+ str(f1score))
        print('MCC:'+ str(matthews))

In [None]:
model_testing(X_test, Y_test, Y_train, X_train)

RandomForestClassifier
[[3456 2122]
 [1720 3782]]
Accuracy: 0.6532490974729241
Macro Precision: 0.6541398595979777
Macro Recall: 0.6534816571150697
Macro F1 score:0.6529486632509846
MCC:0.30762081255065826
XGBClassifier
[[3562 2016]
 [1926 3576]]
Accuracy: 0.6442238267148015
Macro Precision: 0.6442687283374418
Macro Recall: 0.6442628053112536
Macro F1 score:0.6442232587072579
MCC:0.28853153358790073


In [32]:
from sklearn.ensemble import VotingClassifier


ada_clf = AdaBoostClassifier(
DecisionTreeClassifier(max_depth=1), n_estimators=200,
algorithm="SAMME.R", learning_rate=0.5)
xgb_clf = XGBClassifier(random_state=24)
rnd_clf = RandomForestClassifier(random_state=24)
voting_clf = VotingClassifier(
estimators=[('xgb', xgb_clf), ('ada', ada_clf)],voting='hard')
voting_clf.fit(X_train, Y_train)


VotingClassifier(estimators=[('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, gamma=None,
                                            gpu_id=None, importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=None,
                                            max_delta_step=None, max_depth=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            n_estimators=100, n_jobs=None,
                                            num_parallel_tree=None,
                                            random_state=24, reg_alpha=None,
   

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

def algorithm_pipeline(X_train_data, X_test_data, y_train_data, y_test_data,
                       model, param_grid, cv=10, scoring_fit='neg_mean_squared_error',
                       scoring_test=r2_score, do_probabilities = False):
    gs = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv,
        n_jobs=-1,
        scoring=scoring_fit,
        verbose=2
    )
    fitted_model = gs.fit(X_train_data, y_train_data)

# Defining our estimator, the algorithm to optimize
models_to_train = [XGBClassifier(random_state=24)]

# Defining the hyperparameters to optimize
grid_parameters = [
    { # XGBoost
        'n_estimators': [400, 700, 1000],
        'colsample_bytree': [0.7, 0.8],
        'max_depth': [15,20,25],
        'reg_alpha': [1.1, 1.2, 1.3],
        'reg_lambda': [1.1, 1.2, 1.3]}]
        
models_preds_scores = []

for i, model in enumerate(models_to_train):
    params = grid_parameters[i]

    result = algorithm_pipeline(X_train, X_test, Y_train, Y_test,
                                 model, params, cv=5)
    models_preds_scores.append(result)

In [44]:
from sklearn.model_selection import RandomizedSearchCV
from tensorflow import keras

xgb_clf = XGBClassifier(random_state=24)

param_distribs = [
    { # XGBoost
        'n_estimators': [400, 700, 1000],
        'colsample_bytree': [0.7, 0.8],
        'max_depth': [15,20,25],
        'reg_alpha': [1.1, 1.2, 1.3],
        'reg_lambda': [1.1, 1.2, 1.3]}]

rnd_search_cv = RandomizedSearchCV(xgb_clf , param_distribs, n_iter=10, cv=3)

rnd_search_cv.fit(X_train, Y_train)

RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100, n_jobs=None,
                                           num_parallel_tree=None,
                                           random_state=24, reg_alpha=None,
                            

In [46]:
rnd_search_cv.best_params_

{'reg_lambda': 1.2,
 'reg_alpha': 1.1,
 'n_estimators': 400,
 'max_depth': 20,
 'colsample_bytree': 0.7}

{'reg_lambda': 1.2,
 'reg_alpha': 1.1,
 'n_estimators': 400,
 'max_depth': 20,
 'colsample_bytree': 0.7}

In [None]:
model = LGBMClassifier(boosting_type='gbdt', num_leaves=31, max_depth=- 1, learning_rate=0.1, n_estimators=300, device = "gpu")

LGBMC = model.fit(X_train, Y_train)
print(model_name)
# Do the prediction
y_predict =LGBMC.predict(X_test)
print(confusion_matrix(Y_test,y_predict))
recall=recall_score(y_test,y_predict,average='macro')
precision=precision_score(y_test,y_predict,average='macro')
f1score=f1_score(y_test,y_predict,average='macro')
accuracy=accuracy_score(y_test,y_predict)
matthews = matthews_corrcoef(y_test,y_predict) 
print('Accuracy: '+ str(accuracy))
print('Macro Precision: '+ str(precision))
print('Macro Recall: '+ str(recall))
print('Macro F1 score:'+ str(f1score))
print('MCC:'+ str(matthews))

In [41]:
X_train

<59092x1027360 sparse matrix of type '<class 'numpy.int64'>'
	with 9101268 stored elements in Compressed Sparse Row format>

In [36]:
# fit and train the model 
#start_time_1=time.time()
models_c,predictions_c = classi.fit(X_train, X_test, Y_train, Y_test)
#end_time_1=time.time()

AttributeError: select_dtypes not found

In [2]:
import lazypredict
from lazypredict.Supervised import LazyClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

data = load_breast_cancer()
X = data.data
y= data.target

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.5,random_state =123)

clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)

100%|██████████| 29/29 [00:00<00:00, 29.66it/s]


In [5]:
type(X_train)

numpy.ndarray

In [3]:
from lazypredict.Supervised import LazyRegressor
from sklearn import datasets
from sklearn.utils import shuffle
import numpy as np

boston = datasets.load_boston()
X, y = shuffle(boston.data, boston.target, random_state=13)
X = X.astype(np.float32)

offset = int(X.shape[0] * 0.9)

X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]

reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

print(models)

100%|██████████| 42/42 [00:02<00:00, 16.58it/s]

                               Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                                         
SVR                                          0.83       0.88  2.62        0.02
BaggingRegressor                             0.83       0.88  2.63        0.04
NuSVR                                        0.82       0.86  2.76        0.02
RandomForestRegressor                        0.81       0.86  2.78        0.26
XGBRegressor                                 0.81       0.86  2.79        0.14
GradientBoostingRegressor                    0.81       0.86  2.84        0.13
ExtraTreesRegressor                          0.79       0.84  2.98        0.15
AdaBoostRegressor                            0.78       0.83  3.04        0.08
HistGradientBoostingRegressor                0.77       0.83  3.06        0.29
PoissonRegressor                             0.77       0.83  3.11        0.02
LGBMRegressor                                0.77   




In [3]:
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LinearSVC,0.99,0.99,0.99,0.99,0.03
Perceptron,0.99,0.98,0.98,0.99,0.01
LogisticRegression,0.99,0.98,0.98,0.99,0.03
SVC,0.98,0.98,0.98,0.98,0.02
XGBClassifier,0.98,0.98,0.98,0.98,0.07
LabelPropagation,0.98,0.97,0.97,0.98,0.02
LabelSpreading,0.98,0.97,0.97,0.98,0.02
BaggingClassifier,0.97,0.97,0.97,0.97,0.04
PassiveAggressiveClassifier,0.98,0.97,0.97,0.98,0.01
SGDClassifier,0.98,0.97,0.97,0.98,0.01


**Comment:** 1 means truthful, 0 means deceptive

In [11]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.025, random_state=42)

for train_index, test_index in split.split(x, y):
    strat_train_set = concat_data.loc[train_index]
    strat_test_set = concat_data.loc[test_index]

for train_index, validation_index in split.split(strat_train_set["text"], strat_train_set["deceptive"]):
    strat_train_set = concat_data.loc[train_index]
    strat_validation_set = concat_data.loc[validation_index]


In [12]:
strat_train_set.shape
strat_validation_set.shape
strat_test_set.shape

(339191, 2)

(8698, 2)

(8921, 2)

In [13]:
strat_train_set.head()

Unnamed: 0,deceptive,text
301906,truthful,cell like rooms did not appeal to us difficul...
238848,truthful,the red curry sunfish banh mi is a nearly perf...
96414,truthful,this new place is pretty popular i really lik...
217351,truthful,heard about this place dropped in once withou...
101243,truthful,star falafel


In [14]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(ngram_range=(1,2))
x_train=cv.fit_transform(strat_train_set["text"])
y_train=le.fit_transform(strat_train_set["deceptive"])
x_valid=cv.fit_transform(strat_validation_set["text"])
y_valid=le.fit_transform(strat_validation_set["deceptive"])
x_test=cv.fit_transform(strat_test_set["text"])
y_test=le.fit_transform(strat_test_set["deceptive"])

In [15]:
#calculating the weight of each class
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight('balanced', np.unique(strat_train_set["deceptive"]), strat_train_set["deceptive"])
class_weights 

238848    truthful
96414     truthful
217351    truthful
101243    truthful
            ...   
156418    truthful
319286    truthful
277541    truthful
309460    truthful
131382    truthful
Name: deceptive, Length: 339191, dtype: object as keyword args. From version 0.25 passing these as positional arguments will result in an error


array([4.79394805, 0.55822148])

In [16]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, matthews_corrcoef

def model_testing(x_test, y_test, y_train, x_train):
    models = [
        LogisticRegression(max_iter = 10000),
        SVC(),
        MultinomialNB()
        #CatBoostClassifier(iterations=100, task_type="GPU", learning_rate=0.05, l2_leaf_reg=1, depth=11, loss_function= 'Logloss', eval_metric='AUC',random_seed=42, class_weights=[4.79, 0.56], verbose=False)
        ]

    for clf in models:
        model_name = clf.__class__.__name__
        clf.fit(x_train, y_train)
        print(model_name)
        # Do the prediction
        y_predict =clf.predict(cv.transform(x_test))
        print(confusion_matrix(y_test,y_predict))
        recall=recall_score(y_test,y_predict,average='macro')
        precision=precision_score(y_test,y_predict,average='macro')
        f1score=f1_score(y_test,y_predict,average='macro')
        accuracy=accuracy_score(y_test,y_predict)
        matthews = matthews_corrcoef(y_test,y_predict) 
        print('Accuracy: '+ str(accuracy))
        print('Macro Precision: '+ str(precision))
        print('Macro Recall: '+ str(recall))
        print('Macro F1 score:'+ str(f1score))
        print('MCC:'+ str(matthews))

In [16]:
#build the model with hyperparameters
model = CatBoostClassifier(iterations=100, task_type="GPU", learning_rate=0.05, l2_leaf_reg=1, depth=11, loss_function= 'Logloss', eval_metric='AUC',random_seed=42, verbose=False)

In [None]:
model.fit(x_train,y_train,
          eval_set=(x_valid, y_valid),plot =True)


# References
We report here relevant references:
1. author1, article1, journal1, year1, url1
2. author2, article2, journal2, year2, url2