In [2]:
import pandas as pd
import os

# Path relative to the script's directory
script_dir = os.getcwd()
print (script_dir)
path = os.path.join(script_dir, "../results/data_formatted_subjectAreas.csv")

df = pd.read_csv(path)

df.dropna(subset=['Title', 'Abstract', 'Author Keywords', 'Publication Name'], inplace=True)

c:\Users\PangSunatcha\OneDrive - Chulalongkorn University\Documents\Y2S1 files\Data Sci\proj\Data-Sci-project\src


In [85]:
print(df.columns)
df.shape

Index(['Title', 'Cover Date', 'Aggregation Type', 'Authors', 'Subject Areas', 'Author Keywords', 'Abstract', 'Reference Count', 'Publication Name', 'Year', 'Processed Words', 'subject_area_encoded'], dtype='object')


(16289, 12)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from scipy.sparse import hstack

# Encode the target variable
label_encoder = LabelEncoder()
df['subject_area_encoded'] = label_encoder.fit_transform(df['Subject Areas'])

# Split data into features and target
X = df[['Title', 'Abstract', 'Author Keywords', 'Publication Name']]
y = df['subject_area_encoded']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define vectorizers for each column
title_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english', max_df=0.2)
abstract_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english', max_df=0.2)
keywords_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english', max_df=0.2)
publication_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', max_df=0.2)

# Fit and transform each column separately for the training set
X_train_title = title_vectorizer.fit_transform(X_train['Title'])
X_train_abstract = abstract_vectorizer.fit_transform(X_train['Abstract'])
X_train_keywords = keywords_vectorizer.fit_transform(X_train['Author Keywords'])
X_train_publication = publication_vectorizer.fit_transform(X_train['Publication Name'])

# Transform the test set using the same vectorizers
X_test_title = title_vectorizer.transform(X_test['Title'])
X_test_abstract = abstract_vectorizer.transform(X_test['Abstract'])
X_test_keywords = keywords_vectorizer.transform(X_test['Author Keywords'])
X_test_publication = publication_vectorizer.transform(X_test['Publication Name'])

# Combine the transformed columns using hstack
X_train_combined = hstack([X_train_title, X_train_abstract, X_train_keywords, X_train_publication])
X_test_combined = hstack([X_test_title, X_test_abstract, X_test_keywords, X_test_publication])

# Define and train the classifier
classifier = LogisticRegression(C=100, max_iter=500, penalty='l2', solver='saga', random_state=50)
classifier.fit(X_train_combined, y_train)

# Make predictions
y_pred = classifier.predict(X_test_combined)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8710865561694291

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.87      0.87       222
           1       0.85      0.71      0.77        24
           2       0.82      0.75      0.78       211
           3       0.81      0.84      0.83        68
           4       0.92      0.85      0.89        96
           5       0.91      0.88      0.90       165
           6       0.88      0.88      0.88       214
           7       1.00      0.33      0.50         6
           8       1.00      0.92      0.96        61
           9       0.87      0.82      0.85        50
          10       0.72      0.62      0.67        29
          11       0.85      0.92      0.88       114
          12       0.84      0.84      0.84       217
          13       0.84      0.86      0.85       145
          14       0.81      0.65      0.72        26
          15       0.92      0.80      0.85        84
          16       0.85    

In [4]:
scopus_path = os.path.join(script_dir, "../results/sscopus_api.csv")
scopusdf = pd.read_csv(scopus_path)
scopusdf.shape

(1100, 12)

In [5]:
scopus = scopusdf[scopusdf.abstract != '[No abstract available]'] 
scopus = scopus.dropna(subset='abstract')
print(scopus.columns,"\n",scopus.shape)

Index(['title', 'author', 'publicationName', 'cover_date', 'scopus_id',
       'cited_by_count', 'open_access', 'eid', 'aggregationType',
       'affiliations', 'link', 'abstract'],
      dtype='object') 
 (748, 12)


In [73]:
scopus['author_keywords'] = ''

# Preprocess the required columns from the scopus DataFrame
scopus_transformed_title = title_vectorizer.transform(scopus['title'].fillna(''))
scopus_transformed_abstract = abstract_vectorizer.transform(scopus['abstract'].fillna(''))
scopus_transformed_keywords = keywords_vectorizer.transform(scopus['author_keywords'].fillna('')) 
scopus_transformed_publication = publication_vectorizer.transform(scopus['publicationName'].fillna(''))

# Combine the features using hstack
scopus_combined_features = hstack([
    scopus_transformed_title,
    scopus_transformed_abstract,
    scopus_transformed_keywords,
    scopus_transformed_publication
])

# Use the trained model to predict subject areas
scopus_predictions = classifier.predict(scopus_combined_features)

# Decode the predicted labels to the original subject area names
scopus['predicted_subject_area'] = label_encoder.inverse_transform(scopus_predictions)

# Display the results
scopus[['title', 'predicted_subject_area']].head(10)


Unnamed: 0,title,predicted_subject_area
0,Analysing trends of computational urban science and data science approaches for sustainable development,Computer Science
1,A landmark federal interagency collaboration to promote data science in health care: Million Veteran Program-Computational Health Analytics for Medical Precision to Improve Outcomes Now,Social Sciences
2,Regional planning: A failed or flawed project for Africa? Taking advantage of big data science on the horizon,Health Professions
3,Data Science and Model Predictive Control:: A survey of recent advances on data-driven MPC algorithms,Biochemistry
4,Assessment of the relationship between central venous pressure waveform and the severity of tricuspid valve regurgitation using data science,Medicine
5,Data science basis and influencing factors for the evaluation of environmental safety perception in Macau parishes,Engineering
6,2D magnetotelluric imaging method based on visionary self-attention mechanism and data science,Mathematics
7,Data science in sustainable entrepreneurship: A multidisciplinary field of applications,Social Sciences
8,A data science framework for profit health assessment: development and validation,Engineering
9,Community-Engaged Data Science (CEDS): A Case Study of Working with Communities to Use Data to Inform Change,Social Sciences


In [74]:
scopus.to_csv(os.path.join(script_dir, "../results/predicted_scopus.csv"))

In [6]:
arxiv_path = os.path.join(script_dir, "../results/arxiv_data.csv")
arxiv = pd.read_csv(arxiv_path)
print(arxiv.columns,"\n",arxiv.shape)

Index(['ID', 'Title', 'Abstract', 'Authors', 'Published Date', 'Updated Date',
       'Comments', 'Primary Category', 'PDF Link', 'Language'],
      dtype='object') 
 (300, 10)


In [10]:
arxiv['author_keywords'] = ''
arxiv["publicationName"] = ''

# Preprocess the required columns from the scopus DataFrame
scopus_transformed_title = title_vectorizer.transform(arxiv['Title'].fillna(''))
scopus_transformed_abstract = abstract_vectorizer.transform(arxiv['Abstract'].fillna(''))
scopus_transformed_keywords = keywords_vectorizer.transform(arxiv['author_keywords'].fillna('')) 
scopus_transformed_publication = publication_vectorizer.transform(arxiv['publicationName'].fillna(''))

# Combine the features using hstack
arxiv_combined_features = hstack([
    scopus_transformed_title,
    scopus_transformed_abstract,
    scopus_transformed_keywords,
    scopus_transformed_publication
])

# Use the trained model to predict subject areas
arxiv_predictions = classifier.predict(arxiv_combined_features)

# Decode the predicted labels to the original subject area names
arxiv['predicted_subject_area'] = label_encoder.inverse_transform(arxiv_predictions)

# Display the results
arxiv[['Title', 'predicted_subject_area']]


Unnamed: 0,Title,predicted_subject_area
0,A framework for understanding data science,Computer Science
1,Defining Data Science,Physics and Astronomy
2,Data Science in Perspective,Computer Science
3,Data Science: A Comprehensive Overview,Physics and Astronomy
4,Ten Research Challenge Areas in Data Science,Medicine
...,...,...
295,A Survey on Semantics in Automated Data Science,Medicine
296,The cost of reading research. A study of Compu...,Medicine
297,Accuracy of citation data in Web of Science an...,Medicine
298,The KM3NeT Open Science System,Social Sciences


In [8]:
arxiv.to_csv(os.path.join(script_dir, "../results/predicted_arxiv.csv"))

## Below are previous trials of ml model that were not successful

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Encode the target variable
label_encoder = LabelEncoder()
df['subject_area_encoded'] = label_encoder.fit_transform(df['Subject Areas'])

# Split data into features and target
X = df[['Title', 'Abstract', 'Author Keywords', 'Publication Name']]
y = df['subject_area_encoded']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a ColumnTransformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('title_tfidf', TfidfVectorizer(max_features=10000,stop_words='english'), 'Title'),
        ('abstract_tfidf', TfidfVectorizer(max_features=10000,stop_words='english'), 'Abstract'),
        ('keywords_tfidf', TfidfVectorizer(max_features=10000,stop_words='english'), 'Author Keywords'),
        ('publication_tfidf', TfidfVectorizer(max_features=5000,stop_words='english'), 'Publication Name')
    ]
)

# Define the pipeline with a Logistic Regression classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(C=100, max_iter=500, penalty='l2', solver='saga', random_state=50))
])

# Train the pipeline
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8717004297114794

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.87      0.87       222
           1       0.85      0.71      0.77        24
           2       0.82      0.76      0.79       211
           3       0.81      0.84      0.83        68
           4       0.92      0.85      0.89        96
           5       0.90      0.88      0.89       165
           6       0.86      0.89      0.88       214
           7       1.00      0.33      0.50         6
           8       1.00      0.92      0.96        61
           9       0.88      0.84      0.86        50
          10       0.82      0.62      0.71        29
          11       0.86      0.92      0.89       114
          12       0.83      0.83      0.83       217
          13       0.84      0.85      0.84       145
          14       0.81      0.65      0.72        26
          15       0.92      0.79      0.85        84
          16       0.85    

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Target variable encoding
label_encoder = LabelEncoder()
df['subject_area_encoded'] = label_encoder.fit_transform(df['Subject Areas'])

# Vectorize textual columns
tfidf_title = TfidfVectorizer(max_features=500,stop_words='english').fit_transform(df['Title'])
tfidf_abstract = TfidfVectorizer(max_features=1000, stop_words='english').fit_transform(df['Abstract'])
cv_keywords = CountVectorizer(binary=True).fit_transform(df['Author Keywords'])
tfidf_pub_name = TfidfVectorizer(max_features=100,stop_words='english').fit_transform(df['Publication Name'])

# Combine features
X_combined = hstack([tfidf_title, tfidf_abstract, cv_keywords, tfidf_pub_name])
y = df['subject_area_encoded']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

logreg = LogisticRegression(C = 100, max_iter= 500, penalty= 'l2', solver = 'saga' ,random_state=50) 
logreg.fit(X_train, y_train)

# Evaluate the model
y_pred = logreg.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.6347452424800492

Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.58      0.59       222
           1       0.50      0.46      0.48        24
           2       0.43      0.39      0.41       211
           3       0.65      0.59      0.62        68
           4       0.53      0.43      0.47        96
           5       0.56      0.58      0.57       165
           6       0.76      0.79      0.77       214
           7       0.00      0.00      0.00         6
           8       0.76      0.62      0.68        61
           9       0.57      0.40      0.47        50
          10       0.58      0.38      0.46        29
          11       0.63      0.62      0.63       114
          12       0.56      0.65      0.60       217
          13       0.58      0.52      0.55       145
          14       0.46      0.23      0.31        26
          15       0.64      0.54      0.58        84
          16       0.59    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [39]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score


# Split data into features and target
X = df[['Title', 'Abstract', 'Author Keywords', 'Publication Name']]
y = df['Subject Areas']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a ColumnTransformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('title_tfidf', TfidfVectorizer(stop_words='english'), 'Title'),
        ('keywords_tfidf', TfidfVectorizer(stop_words='english'), 'Author Keywords'),
        ('abstract_tfidf', TfidfVectorizer(stop_words='english'), 'Abstract'),
        ('publication_tfidf', TfidfVectorizer(stop_words='english'), 'Publication Name')
    ]
)

# Define the pipeline with a classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Train the pipeline
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Display predictions
print("Predicted Subject Areas:", y_pred)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Predicted Subject Areas: ['Medicine' 'Energy' 'Social Sciences' ...
 'Agricultural and Biological Sciences' 'Social Sciences' 'Biochemistry']
Accuracy: 0.6080417434008594

Classification Report:
                                       precision    recall  f1-score   support

Agricultural and Biological Sciences       0.61      0.54      0.57       222
                 Arts and Humanities       1.00      0.25      0.40        24
                        Biochemistry       0.59      0.27      0.38       211
                            Business       0.86      0.65      0.74        68
                Chemical Engineering       0.59      0.42      0.49        96
                           Chemistry       0.61      0.67      0.64       165
                    Computer Science       0.62      0.88      0.73       214
                   Decision Sciences       0.00      0.00      0.00         6
                           Dentistry       1.00      0.10      0.18        61
        Earth and Plane

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import ast

df['Processed Words'] = df['Processed Words'].apply(ast.literal_eval)
df['Author Keywords'] = df['Author Keywords'].fillna('')
df['combined keywords'] = df.apply(
    lambda row: row['Processed Words'] + row['Author Keywords'].split(', '),
    axis=1
)
df['combined keywords'].apply(lambda row: list(set(row)))

0        [fuel, cells, Bifunctional air electrode, Cata...
1        [benefit, performance, punishment, reinforceme...
2        [, biomarkers, gvhd, predict, acute, outcomes,...
3        [Epigenetics, endocrine, Gene expression, Endo...
4        [Probabilistic finite state machine, inference...
                               ...                        
20211    [metaanalysis, development, , chronic, systema...
20212    [infections, tertiary care hospital, child, tr...
20213    [Gas-generating agent, release, Turmeric extra...
20214    [, cancer, signature, identification, gut, nov...
20215    [cent, Tympanic membrane, acetic, granular, 1,...
Name: combined keywords, Length: 20216, dtype: object

In [41]:
import string
# Combine the relevant text columns into one column for each row
df['combined keywords'] = df['Title'] + " " + df['Author Keywords'] + " " + df['Abstract']

# Now split each row's combined text into words (tokens)
df['combined keywords'] = df['combined keywords'].apply(lambda x: x.split())

# Remove duplicates by converting the list to a set, then back to a list
df['combined keywords'] = df['combined keywords'].apply(lambda x: list(set(x)))

# Finally, join the words back into a single string for each row
df['combined keywords'] = df['combined keywords'].apply(lambda x: ' '.join(x))

# Optionally, reset index if necessary
df.reset_index(drop=True, inplace=True)
df["combined keywords"] = df["combined keywords"].apply(lambda x: ''.join([char for char in x if char not in string.punctuation]))
# Check the result
print(df[['combined keywords']].head())


                                   combined keywords
0  of URPEMFC due maximum mainly Irbased implemen...
1  of often threat performance reinforcement eith...
2  development of utero neurological discuss spec...
3  of Incremental incremental positive introduce ...
4  surfactant of prepare °C optimizing PIT medium...


In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Prepare features and labels
# TF-IDF feature extraction

tfidf = TfidfVectorizer(max_features=5000,stop_words='english')
tfidf_kw = tfidf.fit_transform(df['combined keywords'])

new_tfidf = TfidfVectorizer(max_features=5000,stop_words='english')
tfidf_pn = new_tfidf.fit_transform(df['Publication Name'])

from scipy.sparse import hstack
X = hstack([tfidf_kw, tfidf_pn])
y = df['Subject Areas']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)


# Train Logistic Regression model
logreg = LogisticRegression(C = 100, max_iter= 500, penalty= 'l2', solver = 'saga' ,random_state=50) 
logreg.fit(X_train, y_train)

# Evaluate the model
y_pred = logreg.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9057704112952731

Classification Report:
                                       precision    recall  f1-score   support

Agricultural and Biological Sciences       0.93      0.91      0.92       206
                 Arts and Humanities       0.90      0.68      0.78        28
                        Biochemistry       0.84      0.81      0.82       183
                            Business       0.83      0.85      0.84        62
                Chemical Engineering       0.92      0.93      0.92        84
                           Chemistry       0.93      0.91      0.92       169
                    Computer Science       0.85      0.93      0.89       204
                   Decision Sciences       0.00      0.00      0.00         4
                           Dentistry       0.96      0.96      0.96        55
        Earth and Planetary Sciences       0.88      0.93      0.90        45
            Econometrics and Finance       0.88      0.83      0.86        36
         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Prepare features and labels
# TF-IDF feature extraction
df['combined keywords'] = df['combined keywords'].apply(lambda x: ' '.join(x))
tfidf = TfidfVectorizer(max_features=5000)
tfidf_kw = tfidf.fit_transform(df['combined keywords'])

new_tfidf = TfidfVectorizer(max_features=5000)
tfidf_pn = new_tfidf.fit_transform(df['Publication Name'])

from scipy.sparse import hstack
X = hstack([tfidf_kw, tfidf_pn])
y = df['Subject Areas']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)


# Train Logistic Regression model
logreg = LogisticRegression(C = 100, max_iter= 500, penalty= 'l2', solver = 'saga' ,random_state=50) 
logreg.fit(X_train, y_train)

# Evaluate the model
y_pred = logreg.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8936696340257171

Classification Report:
                                       precision    recall  f1-score   support

Agricultural and Biological Sciences       0.93      0.90      0.91       220
                 Arts and Humanities       0.97      0.88      0.92        33
                        Biochemistry       0.84      0.78      0.81       236
                            Business       0.82      0.85      0.83        65
                Chemical Engineering       0.94      0.82      0.88       102
                           Chemistry       0.87      0.96      0.92       260
                    Computer Science       0.87      0.90      0.88       232
                   Decision Sciences       1.00      0.20      0.33         5
                           Dentistry       0.97      0.90      0.93        70
        Earth and Planetary Sciences       0.88      0.79      0.84        58
            Econometrics and Finance       0.80      0.77      0.79        31
         

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

# Load the dataset
# df = pd.read_csv("/path/to/data_formatted_subjectAreas.csv")
# df['Processed Words'] = df['Processed Words'].apply(eval)

# # Prepare features and labels
# X = df['Processed Words'].apply(lambda x: ' '.join(x))
# y = df['Subject Areas']

# # Split the dataset
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # TF-IDF feature extraction
# tfidf = TfidfVectorizer(max_features=5000)
# X_train_tfidf = tfidf.fit_transform(X_train)
# X_test_tfidf = tfidf.transform(X_test)

# Define parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'saga'],
    'max_iter': [100, 500, 1000]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=LogisticRegression(random_state=50),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

# Train the model
grid_search.fit(X_train, y_train)

# Best parameters and evaluation
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

# Evaluate the model
y_pred = best_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Fitting 5 folds for each of 24 candidates, totalling 120 fits




Best Parameters: {'C': 100, 'max_iter': 100, 'penalty': 'l2', 'solver': 'saga'}
Accuracy: 0.8919386745796242

Classification Report:
                                       precision    recall  f1-score   support

Agricultural and Biological Sciences       0.93      0.90      0.91       220
                 Arts and Humanities       0.97      0.88      0.92        33
                        Biochemistry       0.83      0.78      0.81       236
                            Business       0.82      0.85      0.83        65
                Chemical Engineering       0.93      0.82      0.88       102
                           Chemistry       0.87      0.96      0.91       260
                    Computer Science       0.87      0.89      0.88       232
                   Decision Sciences       1.00      0.20      0.33         5
                           Dentistry       0.97      0.91      0.94        70
        Earth and Planetary Sciences       0.89      0.81      0.85        58
       