In [16]:
import re

import numpy as np
import pandas as pd

We are interested in how to predict or identify a "good" journal, as indicated by its impact factor (IF), using various criteria.
In this case we particularly want to know whether a journal name containing certain keywords makes a difference, alongside all the other criteria.
Although the IF is calculated from article citations, it covers only a limited time window and can undervalue good journals that cover less popular fields.
Hence we also want to see whether a good journal remains good over time.

In [21]:

# Load the merged impact-factor dataset.
df = pd.read_csv('cleaned_impact_factor_merged_file.csv')

# Normalise both keyword columns the same way: cast to str, trim surrounding
# whitespace, then turn empty strings into NaN.
# NOTE(review): astype(str) renders missing values as the literal text 'nan',
# so they are not converted to NaN by the replace step -- confirm this is intended.
for keyword_col in ('Keyword_1', 'Keyword_2'):
    df[keyword_col] = (
        df[keyword_col]
        .astype(str)
        .str.strip()
        .replace('', np.nan)
    )

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2072 entries, 0 to 2071
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   jornal_url       2072 non-null   object 
 1   journal          2072 non-null   object 
 2   article_number   2072 non-null   float64
 3   citation_number  2072 non-null   float64
 4   impact_factor    2072 non-null   float64
 5   most_cited       2072 non-null   object 
 6   year             2072 non-null   float64
 7   cited_by         2072 non-null   float64
 8   Search_Keywords  2072 non-null   object 
 9   Keyword_1        2072 non-null   object 
 10  Keyword_2        2072 non-null   object 
dtypes: float64(5), object(6)
memory usage: 178.2+ KB


Convert the keyword information into the categorical values that we are actually interested in.

In [22]:
# Build one alternation pattern per keyword column.
# - dropna(): a missing keyword would otherwise make str.join raise TypeError.
# - unique(): avoids repeating a keyword once for every row it appears in.
# - re.escape(): keywords are matched literally, not interpreted as regex syntax.
topic_pattern = '|'.join(re.escape(kw) for kw in df['Keyword_1'].dropna().unique())
specifier_pattern = '|'.join(re.escape(kw) for kw in df['Keyword_2'].dropna().unique())

# Flag (case-insensitively) whether any topic / specifier keyword occurs in the journal name.
df['topic_in_journal'] = df['journal'].str.contains(topic_pattern, case=False)
df['specifier_in_journal'] = df['journal'].str.contains(specifier_pattern, case=False)

# Same flags for the title of the journal's most cited article.
df['topic_in_most_cited'] = df['most_cited'].str.contains(topic_pattern, case=False)
# The specifier flag must be built from Keyword_2; matching Keyword_1 here would
# just duplicate 'topic_in_most_cited'.
df['specifier_in_most_cited'] = df['most_cited'].str.contains(specifier_pattern, case=False)


In [23]:
# Keep an independent (deep) snapshot of the frame before it is transformed
# further, then summarise the snapshot's columns.
df_bk = df.copy(deep=True)
df_bk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2072 entries, 0 to 2071
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   jornal_url               2072 non-null   object 
 1   journal                  2072 non-null   object 
 2   article_number           2072 non-null   float64
 3   citation_number          2072 non-null   float64
 4   impact_factor            2072 non-null   float64
 5   most_cited               2072 non-null   object 
 6   year                     2072 non-null   float64
 7   cited_by                 2072 non-null   float64
 8   Search_Keywords          2072 non-null   object 
 9   Keyword_1                2072 non-null   object 
 10  Keyword_2                2072 non-null   object 
 11  topic_in_journal         2072 non-null   bool   
 12  specifier_in_journal     2072 non-null   bool   
 13  topic_in_most_cited      2072 non-null   bool   
 14  specifier_in_most_cited 

By converting the search keywords, what we really want to know is whether each keyword appears in the journal name and in the journal's most cited article.
Keyword 1 is the topic we are interested in (in this case, cancer), and keyword 2 is the more specific field (nano, for example).

In [34]:
# Boolean flag columns to expand into one indicator column per observed value.
categorical_columns = [
    'topic_in_journal',
    'specifier_in_journal',
    'topic_in_most_cited',
    'specifier_in_most_cited',
]

# One-hot encode only those columns; every other column is carried over unchanged.
dummy_df = pd.get_dummies(data=df, columns=categorical_columns)

# Summarise the encoded frame.
dummy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2072 entries, 0 to 2071
Data columns (total 18 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   jornal_url                     2072 non-null   object 
 1   journal                        2072 non-null   object 
 2   article_number                 2072 non-null   float64
 3   citation_number                2072 non-null   float64
 4   impact_factor                  2072 non-null   float64
 5   most_cited                     2072 non-null   object 
 6   year                           2072 non-null   float64
 7   cited_by                       2072 non-null   float64
 8   Search_Keywords                2072 non-null   object 
 9   Keyword_1                      2072 non-null   object 
 10  Keyword_2                      2072 non-null   object 
 11  topic_in_journal_True          2072 non-null   uint8  
 12  specifier_in_journal_False     2072 non-null   u

Since every journal name contains the topic keyword (`topic_in_journal` is always True), that column carries no information, so we drop it along with the text (object) columns.

In [35]:
# Remove the topic indicator (only a `_True` column exists for it), then keep
# just the non-object columns for modelling.
df_dropped = (
    dummy_df
    .drop(columns=['topic_in_journal_True'])
    .select_dtypes(exclude=['object'])
)
df_dropped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2072 entries, 0 to 2071
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   article_number                 2072 non-null   float64
 1   citation_number                2072 non-null   float64
 2   impact_factor                  2072 non-null   float64
 3   year                           2072 non-null   float64
 4   cited_by                       2072 non-null   float64
 5   specifier_in_journal_False     2072 non-null   uint8  
 6   specifier_in_journal_True      2072 non-null   uint8  
 7   topic_in_most_cited_False      2072 non-null   uint8  
 8   topic_in_most_cited_True       2072 non-null   uint8  
 9   specifier_in_most_cited_False  2072 non-null   uint8  
 10  specifier_in_most_cited_True   2072 non-null   uint8  
dtypes: float64(5), uint8(6)
memory usage: 93.2 KB


We want to fit a model for the impact factor, so the impact factor is the target variable.

In [36]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the regression target (impact factor).
features = df_dropped.drop(columns='impact_factor')
target = df_dropped['impact_factor']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Report the dimensions of each feature matrix.
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (1657, 10)
Testing set shape: (415, 10)


Scale the dataset based on the training set only; the same fitted scaler is then applied to the testing set.

In [37]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, so the test rows do not
# influence the fitted scaling parameters.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both subsets.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [39]:
import pickle

In [41]:
# Bundle the preprocessing artefacts into a single mapping: the modelling frame,
# the unscaled split, the fitted scaler and the scaled feature matrices.
master_list = {
    'df': df_dropped,
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'scaler': scaler,
    'X_train_scaled': X_train_scaled,
    'X_test_scaled': X_test_scaled,
}

Save data for later use

In [42]:
# Serialise the bundled artefacts to disk; the file is opened in binary write
# mode and closed automatically when the block exits.
with open('preprocessed_datamaster_list.pkl', 'wb') as pkl_out:
    pickle.dump(master_list, pkl_out)