In [3]:
import pandas as pd

# Assuming retractions35215.csv is in the current directory
df = pd.read_csv("retractions35215.csv")
df.head(3)

Unnamed: 0,Record ID,Title,Subject,Institution,Journal,Publisher,Country,Author,URLS,ArticleType,...,RetractionDOI,RetractionPubMedID,OriginalPaperDate,OriginalPaperDOI,OriginalPaperPubMedID,RetractionNature,Reason,Paywalled,Notes,CitationCount
0,50792,A fractional order nonlinear model of the love...,(B/T) Data Science;(PHY) Mathematics;,"Department of Mathematical Sciences, College o...",Scientific Reports,Springer - Nature Publishing Group,United Arab Emirates,Zulqurnain Sabir;Salem Ben Said,,Research Article;,...,10.1038/s41598-024-51277-3,38191570.0,3/04/2023,10.1038/s41598-023-32497-5,37012356.0,Retraction,+Duplication of Article;+Euphemisms for Duplic...,No,See also: https://pubpeer.com/publications/E4F...,5
1,50782,Investigation of automotive digital mirrors er...,(PHY) Engineering - Mechanical;,"Department of Physics, Faculty of Science, Cai...",Journal of Optics (India),Springer,Egypt,H S Ayoub;Wessam M Hussein;Y H Elbashar,,Research Article;,...,10.1007/s12596-023-01630-y,0.0,12/01/2021,10.1007/s12596-021-00677-z,0.0,Retraction,+Fake Peer Review;+Investigation by Journal/Pu...,No,,2
2,50781,Optical spectroscopic analysis of bandpass fil...,(PHY) Chemistry;(PHY) Crystallography/Spectros...,"Egypt Nanotechnology Center ((EGNC)), Cairo Un...",Journal of Optics (India),Springer,Egypt,Y H Elbashar;M A Mohamed;D Rayan;A M Badr;H A ...,,Research Article;,...,10.1007/s12596-023-01628-6,0.0,5/05/2020,10.1007/s12596-020-00611-9,0.0,Retraction,+Concerns/Issues with Peer Review;+Fake Peer R...,No,,14


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# Load the dataset
retraction = pd.read_csv("retractions35215.csv")

# Keep only the required columns
retractions = retraction[['CitationCount', 'OriginalPaperDate', 'ArticleType', 'RetractionDate']]

# Handling missing values (Dropping Null Rows)
retractions = retractions.dropna()

# Feature Engineering
retractions['Publication_Year'] = pd.to_datetime(retractions['OriginalPaperDate'], format='%d/%m/%Y', dayfirst=True).dt.year
retractions['Retraction_Year'] = pd.to_datetime(retractions['RetractionDate'], format='%d/%m/%Y', dayfirst=True).dt.year
retractions['Retraction_Lag'] = (retractions['Retraction_Year'] - retractions['Publication_Year']) / 365.25

# Encode categorical features
le = LabelEncoder()
retractions['ArticleType'] = le.fit_transform(retractions['ArticleType'])

# Select features and target variable
X = retractions[['CitationCount', 'Publication_Year', 'ArticleType']]
y = retractions['Retraction_Lag']

# Splitting data into 50% training and 50% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

# Support Vector Regression with linear kernel and standardization via pipeline
svr_linear = make_pipeline(StandardScaler(), SVR(kernel='linear'))
svr_linear.fit(X_train, y_train)
mse_svr_linear = mean_squared_error(y_test, svr_linear.predict(X_test))

# Support Vector Regression with RBF kernel and standardization via pipeline
svr_rbf = make_pipeline(StandardScaler(), SVR(kernel='rbf'))
svr_rbf.fit(X_train, y_train)
mse_svr_rbf = mean_squared_error(y_test, svr_rbf.predict(X_test))

# Hyperparameter tuning for SVR with RBF kernel
param_grid = {
    'svr__C': [0.1, 1, 10, 100],
    'svr__gamma': [1, 0.1, 0.01, 0.001]
}
grid = GridSearchCV(svr_rbf, param_grid, scoring='neg_mean_squared_error', refit=True, verbose=2)
grid.fit(X_train, y_train)
best_svr_rbf = grid.best_estimator_
mse_best_svr_rbf = mean_squared_error(y_test, best_svr_rbf.predict(X_test))

# Random Forest Regressor
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train, y_train)
mse_rf = mean_squared_error(y_test, rf_reg.predict(X_test))

# Compare mean squared errors
print("Mean Squared Error comparison:")
print("SVR with standardization (linear) --> %.3f" % mse_svr_linear)
print("SVR with standardization (RBF) --> %.3f" % mse_svr_rbf)
print("Tuned SVR with standardization (RBF) --> %.3f" % mse_best_svr_rbf)
print("Random Forest --> %.3f" % mse_rf)


Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END ...........................svr__C=0.1, svr__gamma=1; total time=   0.0s
[CV] END ...........................svr__C=0.1, svr__gamma=1; total time=   0.0s
[CV] END ...........................svr__C=0.1, svr__gamma=1; total time=   0.0s
[CV] END ...........................svr__C=0.1, svr__gamma=1; total time=   0.0s
[CV] END ...........................svr__C=0.1, svr__gamma=1; total time=   0.0s
[CV] END .........................svr__C=0.1, svr__gamma=0.1; total time=   0.0s
[CV] END .........................svr__C=0.1, svr__gamma=0.1; total time=   0.0s
[CV] END .........................svr__C=0.1, svr__gamma=0.1; total time=   0.0s
[CV] END .........................svr__C=0.1, svr__gamma=0.1; total time=   0.0s
[CV] END .........................svr__C=0.1, svr__gamma=0.1; total time=   0.0s
[CV] END ........................svr__C=0.1, svr__gamma=0.01; total time=   0.0s
[CV] END ........................svr__C=0.1, svr

In [9]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load your dataset (replace 'your_dataset.csv' with your actual dataset)
retractions = pd.read_csv("retractions35215.csv")

# Feature Engineering
retractions['Publication_Year'] = pd.to_datetime(retractions['OriginalPaperDate'], format='%d/%m/%Y', dayfirst=True).dt.year
retractions['Retraction_Year'] = pd.to_datetime(retractions['RetractionDate'], format='%d/%m/%Y', dayfirst=True).dt.year
retractions['Retraction_Lag'] = (retractions['Retraction_Year'] - retractions['Publication_Year']) / 365.25

# Select features and target variable
features = ['CitationCount', 'Retraction_Lag', 'ArticleType']
retractions['ArticleType'] = retractions['ArticleType'].astype('category').cat.codes

# Use 'Reason' as the target variable and encode it
retractions['Reason'] = retractions['Reason'].astype('category').cat.codes
y = retractions['Reason']

# Filter for top n most frequent classes
n = 10
top_classes = y.value_counts().index[:n]
filtered_data = retractions[retractions['Reason'].isin(top_classes)]

X = filtered_data[features]
y = filtered_data['Reason']

# Ensure balanced splitting using StratifiedKFold
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Gaussian Naive Bayes
gnb = GaussianNB().fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)
acc_gnb = accuracy_score(y_test, y_pred_gnb)

# SVM with no standardization (linear kernel)
svm_clf_linear = SVC(kernel='linear').fit(X_train, y_train)
y_pred_svm_linear = svm_clf_linear.predict(X_test)
acc_svm_linear = accuracy_score(y_test, y_pred_svm_linear)

# SVM with standardization via pipeline (linear kernel)
pipe_linear = make_pipeline(StandardScaler(), SVC(kernel='linear'))
pipe_linear.fit(X_train, y_train)
acc_svm_linear_std = pipe_linear.score(X_test, y_test)

# SVM with RBF kernel and no standardization
svm_clf_rbf = SVC(kernel='rbf').fit(X_train, y_train)
y_pred_svm_rbf = svm_clf_rbf.predict(X_test)
acc_svm_rbf = accuracy_score(y_test, y_pred_svm_rbf)

# SVM with standardization via pipeline (RBF kernel)
pipe_rbf = make_pipeline(StandardScaler(), SVC(kernel='rbf'))
pipe_rbf.fit(X_train, y_train)
acc_svm_rbf_std = pipe_rbf.score(X_test, y_test)

# Compare accuracy scores
print("Accuracy comparison:")
print("Gaussian Naive Bayes --> %.3f%%" % (acc_gnb * 100))
print("SVM (linear) --> %.3f%%" % (acc_svm_linear * 100))
print("SVM with standardization (linear) --> %.3f%%" % (acc_svm_linear_std * 100))
print("SVM (RBF) --> %.3f%%" % (acc_svm_rbf * 100))
print("SVM with standardization (RBF) --> %.3f%%" % (acc_svm_rbf_std * 100))


Accuracy comparison:
Gaussian Naive Bayes --> 20.815%
SVM (linear) --> 41.928%
SVM with standardization (linear) --> 45.968%
SVM (RBF) --> 42.627%
SVM with standardization (RBF) --> 47.577%


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Load the dataset
df = pd.read_csv("retractions35215.csv")

# Drop rows with missing values
df.dropna(inplace=True)

# Encode categorical variables
label_encoder = LabelEncoder()
df['ArticleType'] = label_encoder.fit_transform(df['ArticleType'])
df['RetractionNature'] = label_encoder.fit_transform(df['RetractionNature'])

# Define Features (X) and Target Variable (y)
features = ['CitationCount', 'ArticleType']
X = df[features]
y = df['RetractionNature']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest Classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 1.0


In [12]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load your dataset (replace 'your_dataset.csv' with your actual dataset)
retractions = pd.read_csv("retractions35215.csv")

# Feature Engineering
retractions['Publication_Year'] = pd.to_datetime(retractions['OriginalPaperDate'], format='%d/%m/%Y', dayfirst=True).dt.year
retractions['Retraction_Year'] = pd.to_datetime(retractions['RetractionDate'], format='%d/%m/%Y', dayfirst=True).dt.year
retractions['Retraction_Lag'] = (retractions['Retraction_Year'] - retractions['Publication_Year']) / 365.25

# Select features and target variable
features = ['CitationCount', 'Retraction_Lag', 'ArticleType']
retractions['ArticleType'] = retractions['ArticleType'].astype('category').cat.codes

# Use 'Reason' as the target variable and encode it
retractions['Reason'] = retractions['Reason'].astype('category').cat.codes
y = retractions['Reason']

# Filter for top n most frequent classes
n = 10
top_classes = y.value_counts().index[:n]
filtered_data = retractions[retractions['Reason'].isin(top_classes)]

X = filtered_data[features]
y = filtered_data['Reason']

# Ensure balanced splitting using StratifiedKFold
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Gaussian Naive Bayes
gnb = GaussianNB().fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)
acc_gnb = accuracy_score(y_test, y_pred_gnb)

# SVM with no standardization (linear kernel)
svm_clf_linear = SVC(kernel='linear').fit(X_train, y_train)
y_pred_svm_linear = svm_clf_linear.predict(X_test)
acc_svm_linear = accuracy_score(y_test, y_pred_svm_linear)

# SVM with standardization via pipeline (linear kernel)
pipe_linear = make_pipeline(StandardScaler(), SVC(kernel='linear'))
pipe_linear.fit(X_train, y_train)
acc_svm_linear_std = pipe_linear.score(X_test, y_test)

# SVM with RBF kernel and no standardization
svm_clf_rbf = SVC(kernel='rbf').fit(X_train, y_train)
y_pred_svm_rbf = svm_clf_rbf.predict(X_test)
acc_svm_rbf = accuracy_score(y_test, y_pred_svm_rbf)

# SVM with standardization via pipeline (RBF kernel)
pipe_rbf = make_pipeline(StandardScaler(), SVC(kernel='rbf'))
pipe_rbf.fit(X_train, y_train)
acc_svm_rbf_std = pipe_rbf.score(X_test, y_test)

# Random Forest Classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=0)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)

# Compare accuracy scores
print("Accuracy comparison:")
print("Gaussian Naive Bayes --> %.3f%%" % (acc_gnb * 100))
print("SVM (linear) --> %.3f%%" % (acc_svm_linear * 100))
print("SVM with standardization (linear) --> %.3f%%" % (acc_svm_linear_std * 100))
print("SVM (RBF) --> %.3f%%" % (acc_svm_rbf * 100))
print("SVM with standardization (RBF) --> %.3f%%" % (acc_svm_rbf_std * 100))
print("Random Forest --> %.3f%%" % (acc_rf * 100))


Accuracy comparison:
Gaussian Naive Bayes --> 20.815%
SVM (linear) --> 41.928%
SVM with standardization (linear) --> 45.968%
SVM (RBF) --> 42.627%
SVM with standardization (RBF) --> 47.577%
Random Forest --> 47.140%


In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

# Load the dataset
retractions = pd.read_csv('retractions35215.csv')

# Drop unnecessary columns
retractions = retractions.drop(['Title', 'URLS', 'RetractionDOI', 'OriginalPaperDOI', 'Notes', 'Subject', 'Journal', 'Publisher'], axis=1)

# Handling missing values (Dropping Null Rows)
retractions = retractions.dropna()

# Feature Engineering
retractions['Publication_Year'] = pd.to_datetime(retractions['OriginalPaperDate'], format='%d/%m/%Y', dayfirst=True).dt.year
retractions['Retraction_Year'] = pd.to_datetime(retractions['RetractionDate'], format='%d/%m/%Y', dayfirst=True).dt.year
retractions['Retraction_Lag'] = (retractions['Retraction_Year'] - retractions['Publication_Year']) / 365.25

# Select features and target variable
features = ['CitationCount', 'Retraction_Lag', 'ArticleType']
retractions['ArticleType'] = retractions['ArticleType'].astype('category').cat.codes

# Use 'Reason' as the target variable and encode it
retractions['Reason'] = retractions['Reason'].astype('category').cat.codes
y = retractions['Reason']

# Filter for top n most frequent classes
n = 10
top_classes = y.value_counts().index[:n]
filtered_data = retractions[retractions['Reason'].isin(top_classes)]

X = filtered_data[features]
y = filtered_data['Reason']

# Ensure balanced splitting using StratifiedKFold
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Gaussian Naive Bayes
gnb = GaussianNB().fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)
acc_gnb = accuracy_score(y_test, y_pred_gnb)

# SVM with no standardization (linear kernel)
svm_clf_linear = SVC(kernel='linear').fit(X_train, y_train)
y_pred_svm_linear = svm_clf_linear.predict(X_test)
acc_svm_linear = accuracy_score(y_test, y_pred_svm_linear)

# SVM with standardization via pipeline (linear kernel)
pipe_linear = make_pipeline(StandardScaler(), SVC(kernel='linear'))
pipe_linear.fit(X_train, y_train)
acc_svm_linear_std = pipe_linear.score(X_test, y_test)

# SVM with RBF kernel and no standardization
svm_clf_rbf = SVC(kernel='rbf').fit(X_train, y_train)
y_pred_svm_rbf = svm_clf_rbf.predict(X_test)
acc_svm_rbf = accuracy_score(y_test, y_pred_svm_rbf)

# SVM with standardization via pipeline (RBF kernel)
pipe_rbf = make_pipeline(StandardScaler(), SVC(kernel='rbf'))
pipe_rbf.fit(X_train, y_train)
acc_svm_rbf_std = pipe_rbf.score(X_test, y_test)

# Random Forest Classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=0)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)

# Compare accuracy scores
print("Accuracy comparison:")
print("Gaussian Naive Bayes --> %.3f%%" % (acc_gnb * 100))
print("SVM (linear) --> %.3f%%" % (acc_svm_linear * 100))
print("SVM with standardization (linear) --> %.3f%%" % (acc_svm_linear_std * 100))
print("SVM (RBF) --> %.3f%%" % (acc_svm_rbf * 100))
print("SVM with standardization (RBF) --> %.3f%%" % (acc_svm_rbf_std * 100))
print("Random Forest --> %.3f%%" % (acc_rf * 100))


Accuracy comparison:
Gaussian Naive Bayes --> 49.325%
SVM (linear) --> 45.159%
SVM with standardization (linear) --> 50.428%
SVM (RBF) --> 46.433%
SVM with standardization (RBF) --> 53.338%
Random Forest --> 53.205%


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, mean_absolute_error

# Load the dataset
retractions = pd.read_csv('retractions35215.csv')

# Drop unnecessary columns
retractions = retractions.drop(['Title', 'URLS', 'RetractionDOI', 'OriginalPaperDOI', 'Notes', 'Subject', 'Journal', 'Publisher'], axis=1)

# Handling missing values (Dropping Null Rows)
retractions = retractions.dropna()

# Feature Engineering
retractions['Publication_Year'] = pd.to_datetime(retractions['OriginalPaperDate'], format='%d/%m/%Y', dayfirst=True).dt.year
retractions['Retraction_Year'] = pd.to_datetime(retractions['RetractionDate'], format='%d/%m/%Y', dayfirst=True).dt.year
retractions['Retraction_Lag'] = (retractions['Retraction_Year'] - retractions['Publication_Year']) / 365.25

# Select features and target variable
features = ['CitationCount', 'Retraction_Lag', 'ArticleType']
retractions['ArticleType'] = retractions['ArticleType'].astype('category').cat.codes

# Use 'Reason' as the target variable and encode it
retractions['Reason'] = retractions['Reason'].astype('category').cat.codes
y = retractions['Reason']

# Filter for top n most frequent classes
n = 10
top_classes = y.value_counts().index[:n]
filtered_data = retractions[retractions['Reason'].isin(top_classes)]

X = filtered_data[features]
y = filtered_data['Reason']

# Ensure balanced splitting using StratifiedKFold
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Random Forest Classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=0)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

# Compute classification metrics
accuracy = accuracy_score(y_test, y_pred_rf)
precision = precision_score(y_test, y_pred_rf, average='weighted')
recall = recall_score(y_test, y_pred_rf, average='weighted')
f1 = f1_score(y_test, y_pred_rf, average='weighted')

# Print classification metrics
print("---Classification Metrics---")
print("Accuracy: %.3f" % accuracy)
print("Precision: %.3f" % precision)
print("Recall: %.3f" % recall)
print("F1-score: %.3f" % f1)

# Random Forest Regressor 
rf_reg = RandomForestRegressor(n_estimators=100, random_state=0)
rf_reg.fit(X_train, y_train)
y_pred_reg = rf_reg.predict(X_test)

# Compute regression metrics
mse = mean_squared_error(y_test, y_pred_reg)
mae = mean_absolute_error(y_test, y_pred_reg)
rmse = np.sqrt(mse)

# Print regression metrics
print("\n---Regression Metrics---")
print("Mean Squared Error (MSE): %.3f" % mse)
print("Mean Absolute Error (MAE): %.3f" % mae)
print("Root Mean Squared Error (RMSE): %.3f" % rmse)


---Classification Metrics---
Accuracy: 0.532
Precision: 0.430
Recall: 0.532
F1-score: 0.446

---Regression Metrics---
Mean Squared Error (MSE): 4545874.215
Mean Absolute Error (MAE): 1786.251
Root Mean Squared Error (RMSE): 2132.106
