# Configure Session


In [2]:
%load_ext autoreload

import os
import sys

# NOTE(review): absolute, machine-specific path — this cell only works on a
# machine with exactly this directory layout; consider making it configurable.
project_root = "/Users/carolinapinto/desktop/NL/Mainproject/Natural-Language"
# Add the project root to the import path (only once) so that the `src.*`
# modules imported by later cells can be resolved.
if project_root not in sys.path:
    sys.path.append(project_root)
# Later cells use relative paths (e.g. "data/raw/train.txt"), which are
# resolved against the working directory set here.
os.chdir(project_root)

print("Current working directory:", os.getcwd())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Current working directory: /Users/carolinapinto/Desktop/NL/Mainproject/Natural-Language


# Load Data


In [112]:
%autoreload 2
from src.dataset import read_data

path = "data/raw/train.txt"
columns = ["title", "from", "genre", "director", "description"]

df = read_data(path, columns)
df.head(3)

Unnamed: 0,title,from,genre,director,description
0,Ela Cheppanu,Telugu,romance,Ramana,Sekhar (Tarun) is a graduate from IIM and work...
1,A Nightmare on Elm Street,American,horror,Samuel Bayer,Kris Fowles (Katie Cassidy) goes to the Spring...
2,American Gothic,American,horror,John Hough,Cynthia is traumatized by the death of her bab...


# EDA


### First Impressions


In [113]:
%autoreload 2
# Quick overview of the raw frame: schema, most repeated titles, most frequent
# directors and origins, and the number of fully duplicated rows.
separator = "*" * 20

# info() prints its report itself and returns None, which print() then echoes.
print(df.info())
print(separator)

title_counts = df["title"].value_counts()
repeated_titles = title_counts[title_counts > 1].head(5)
print(repeated_titles)
print(separator)

popular_directors = df["director"].value_counts().head(5)
print(popular_directors)
print(separator)

print(df["from"].value_counts().head(5))
print(separator)

print("Number of duplicates:", df.duplicated().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8041 entries, 0 to 8040
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        8041 non-null   object
 1   from         8041 non-null   object
 2   genre        8041 non-null   object
 3   director     8041 non-null   object
 4   description  8041 non-null   object
dtypes: object(5)
memory usage: 314.2+ KB
None
********************
title
Dracula               4
The Crimson Circle    3
Hindle Wakes          3
Romance               3
I Love You            3
Name: count, dtype: int64
********************
director
Unknown            178
Hanna-Barbera       46
Lesley Selander     32
Maurice Elvey       28
Joseph Kane         23
Name: count, dtype: int64
********************
from
American     4050
British      1415
Bollywood     632
Telugu        276
Tamil         261
Name: count, dtype: int64
********************
Number of duplicates: 18


### Drop Duplicates


In [55]:
df[df.duplicated(keep=False)].sort_values("title").head(20)

Unnamed: 0,title,from,genre,director,description
7102,At Gunpoint,American,western,Alfred L. Werker,"Plainview is a peaceful town, all the better f..."
4053,At Gunpoint,American,western,Alfred L. Werker,"Plainview is a peaceful town, all the better f..."
3121,Black Rock,American,horror,Katie Aselton,Sarah (Kate Bosworth) invites her childhood fr...
4714,Black Rock,American,horror,Katie Aselton,Sarah (Kate Bosworth) invites her childhood fr...
6456,Captain America,American,action,Albert Pyun,"In Fascist Italy in 1936, the government kidna..."
6285,Captain America,American,action,Albert Pyun,"In Fascist Italy in 1936, the government kidna..."
226,Creep,British,horror,Christopher Smith,Arthur (Ken Campbell) and George (Vas Blackwoo...
4126,Creep,British,horror,Christopher Smith,Arthur (Ken Campbell) and George (Vas Blackwoo...
6576,Drive-In Massacre,American,horror,Stu Segall,A couple go to a drive-in theater in a rural C...
4563,Drive-In Massacre,American,horror,Stu Segall,A couple go to a drive-in theater in a rural C...


In [114]:
# Remove fully duplicated rows, then renumber the index so it is contiguous again.
df = df.drop_duplicates().reset_index(drop=True)

### Plots


In [None]:
%autoreload 2

from src.plots import plot_movie_data

plot_movie_data(df)

In [None]:
%autoreload 2

from src.plots import plot_stopword_frequency

plot_stopword_frequency(df, "description")

In [None]:
plot_stopword_frequency(df, "title")

In [9]:
%autoreload 2
from src.plots import get_text_statistics
stats = get_text_statistics(df, 'description')

In [None]:
%autoreload 2
from src.plots import plot_histograms
plot_histograms(stats)

In [None]:
%autoreload 2
from src.plots import plot_boxplots
plot_boxplots(stats)

In [None]:
%autoreload 2
from sklearn.preprocessing import StandardScaler
from src.plots import plot_correlation_matrix, apply_pca
# Standardize the data outside the PCA function
scaler = StandardScaler()
scaled_data = scaler.fit_transform(stats)

plot_correlation_matrix(stats)
pca_df, pca = apply_pca(scaled_data, df['genre'])

### Standardize Directors

In [115]:
from collections import defaultdict
import re
from src.directors import create_name_map, map_director_names

#Create the mapping dictionary that maps the original director name to a standardized version
name_map = create_name_map(df)
#Map the names
map_director_names(df, name_map)

In [14]:
filtered_name_map = {key: value for key, value in name_map.items() if len(value) > 1}

In [15]:
filtered_name_map

{'kcbokadia': {'K C Bokadia', 'K. C. Bokadia'},
 'hannabarbera': {'Hanna Barbera', 'Hanna-Barbera'},
 'rajkumarkohli': {'Raj Kumar Kohli', 'Rajkumar Kohli'},
 'rgspringsteen': {'R. G. Springsteen', 'R.G. Springsteen'},
 '3directors': {'3 Directors', '3 directors'},
 'chigurudutt': {'Chi Guru Dutt', 'Chi. Guru Dutt'},
 'wsvandyke': {'W. S. Van Dyke', 'W.S. Van Dyke'},
 'andredetoth': {'Andre DeToth', 'Andre de Toth'},
 'tlvprasad': {'T. L. V. Prasad', 'T.L.V. Prasad'},
 'yuenwooping': {'Yuen Woo Ping', 'Yuen Woo-ping'},
 'kbalachander': {'K. Balachander', 'K.Balachander'},
 'brchopra': {'B. R. Chopra', 'B.R. Chopra'},
 'poonmankit': {'Poon Man Kit', 'Poon Man-kit'},
 'pramodpappan': {'Pramod - Pappan', 'Pramod Pappan'},
 'vvvinayak': {'V. V. Vinayak', 'V.V. Vinayak'},
 'abhimsingh': {'A. Bhim Singh', 'A. Bhimsingh'},
 'dwgriffith': {'D. W. Griffith', 'D.W. Griffith'},
 'johnmackenzie': {'John MacKenzie', 'John Mackenzie'},
 'vkprakash': {'V. K. Prakash', 'V.K.Prakash'},
 'xuexiaolu': {'

In [59]:
df.head(5)

Unnamed: 0,title,from,genre,director,description
0,Ela Cheppanu,Telugu,romance,ramana,Sekhar (Tarun) is a graduate from IIM and work...
1,A Nightmare on Elm Street,American,horror,samuelbayer,Kris Fowles (Katie Cassidy) goes to the Spring...
2,American Gothic,American,horror,johnhough,Cynthia is traumatized by the death of her bab...
3,Gang,Bollywood,crime,mazharkhan,"Four friends, Gangu (Jackie Shroff), Abdul (Na..."
4,Intimate Relations,British,drama,charlesfrank,Crisis in a middle-class family when the son f...


### Duplicate Candidates


In [116]:
%autoreload 2

from src.dataset import filter_duplicate_descriptions

filtered_df=filter_duplicate_descriptions(df, "description", "title")
filtered_df

Unnamed: 0,title,from,genre,director,description
5741,Black Dalia,Malayalam,action,baburaj,A student of Sacred Heart Medical College is f...
4028,Anthima Theerpu,Telugu,action,advbaburaj,A student of Sacred Heart Medical College is f...
435,Bhayya,Telugu,action,boopathypandian,Anbu (Vishal) is a happy-go-lucky engineering ...
7419,Malaikottai,Tamil,action,boopathypandian,Anbu (Vishal) is a happy-go-lucky engineering ...
3026,The Protector,Hong Kong,action,jamesglickenhaus,"As noted above, Jackie Chan re-edited The Prot..."
7693,"Protector, The",American,action,jamesglickenhaus,"As noted above, Jackie Chan re-edited The Prot..."
3036,36 Hours,British,crime,montgomerytully,"Bill Rogers (Dan Duryea), an American jet pilo..."
2444,Terror Street,American,crime,montgomerytully,"Bill Rogers (Dan Duryea), an American jet pilo..."
6766,Harry Tracy,American,western,williamagraham,"By the end of the 19th century, Butch Cassidy,..."
1452,"Harry Tracy, Desperado",Canadian,western,williamgraham,"By the end of the 19th century, Butch Cassidy,..."


In [117]:
num_rows = filtered_df.shape[0]
print(f"Number of rows with duplicate descriptions: {num_rows}")

Number of rows with duplicate descriptions: 55


In [118]:
%autoreload 2
from src.dataset import find_similar_descriptions

similar_pairs = find_similar_descriptions(df, "description")


In [88]:
print(similar_pairs)

[(6, 5948, np.float64(1.0000000000000002), 1.0), (95, 1641, np.float64(1.0), 1.0), (435, 7419, np.float64(1.0000000000000004), 1.0), (578, 7355, np.float64(1.0000000000000002), 1.0), (819, 1009, np.float64(1.0000000000000002), 1.0), (923, 1591, np.float64(1.0), 1.0), (974, 5102, np.float64(1.0), 1.0), (1043, 4839, np.float64(0.9999999999999999), 1.0), (1056, 2826, np.float64(1.0000000000000002), 1.0), (1368, 7830, np.float64(0.9999999999999999), 1.0), (1452, 6766, np.float64(1.0000000000000002), 1.0), (1517, 1760, np.float64(1.0000000000000002), 1.0), (1517, 2215, np.float64(1.0000000000000002), 1.0), (1546, 7612, np.float64(1.0000000000000002), 1.0), (1760, 2215, np.float64(1.0000000000000002), 1.0), (1941, 4509, np.float64(1.0000000000000002), 1.0), (1967, 3454, np.float64(1.0000000000000002), 1.0), (2088, 3987, np.float64(1.0000000000000002), 1.0), (2307, 2316, np.float64(1.0), 1.0), (2399, 5016, np.float64(0.9999999999999998), 1.0), (2444, 3036, np.float64(1.0000000000000002), 1.0)

In [119]:
length_of_similar_pairs = len(similar_pairs)
print("Number of similar pairs:", length_of_similar_pairs)

Number of similar pairs: 49


In [83]:
print(df.loc[[1370, 1648]])

                  title      from      genre       director  \
1370  Rabbit of Seville  American  animation  charlesmjones   
1648         Hare Brush  American  animation    frizfreleng   

                                            description  
1370  The cartoon opens with people filing in to see...  
1648  In the boardroom of the Elmer J. Fudd Corporat...  


In [65]:
print(df.loc[[95, 1641], ["description"]])

                                            description
95    The film opens with two people – a man (Aravin...
1641  The film opens with two people – a man (Aravin...


In [None]:
%autoreload 2
from src.dataset import print_differences

print_differences(df, similar_pairs, "title")
print("\n")
print_differences(df, similar_pairs, "director")
print("\n")
print_differences(df, similar_pairs, "genre")
print("\n")
print_differences(df, similar_pairs, "from")

In [122]:
from src.dataset import validate_and_filter_duplicates_fuzzy
validate_and_filter_duplicates_fuzzy(df, similar_pairs, ["director"], threshold=80)

In [130]:
%autoreload 2

from src.dataset import filter_duplicate_descriptions

# Keep the result under a name: the following cell reports its row count via
# `filter_df2`, which was otherwise never assigned in this notebook.
filter_df2 = filter_duplicate_descriptions(df, "description", "title")
filter_df2

Unnamed: 0,title,from,genre,director,description
2399,Mr. Bug Goes to Town,American,animation,davefleischer,"Hoppity the Grasshopper, after a period spent ..."
5016,Mister Bug Goes to Town,American,animation,unknown,"Hoppity the Grasshopper, after a period spent ..."
6850,Abhimanyu,Kannada,action,arjunsarja,"Jaihind is the journey of five people, linked ..."
7489,Jaihind 2,Tamil,action,arjun,"Jaihind is the journey of five people, linked ..."
2307,Nayagi,Tamil,horror,govi,Nayaki is a horror movie . Nayaki is about Gay...
2316,Nayaki,Telugu,horror,goverdhanreddy,Nayaki is a horror movie . Nayaki is about Gay...
6194,Agni,Tamil,romance,ajrharikesava,Tanisha is a young daughter of a man working f...
7589,Agnee,Bangladeshi,action,iftakarchowdhury,Tanisha is a young daughter of a man working f...
3686,Library War: The Wings of Revolution,Japanese,animation,takayukihamana,The background of the plot is based on the Sta...
4176,Toshokan Sensō,Japanese,sci-fi,shinsukesato,The background of the plot is based on the Sta...


In [129]:
num_rows = filter_df2.shape[0]
print(f"Number of rows with duplicate descriptions: {num_rows}")

Number of rows with duplicate descriptions: 14


### Candidate Duplicates

In [29]:
import pandas as pd
import re
from rapidfuzz import fuzz, process  # Using rapidfuzz for fast fuzzy matching
import unicodedata


def preprocess_and_find_merge_candidates(df, director_column, threshold=85):
    """
    Preprocesses director names and finds candidates for merging based on fuzzy matching.

    Every entry of ``director_column`` is ASCII-folded, stripped of special
    characters and of stand-alone capital initials, lower-cased and split on
    commas, so that each co-director is compared individually. Every unique
    processed name is then scored against all the others with
    ``rapidfuzz.fuzz.ratio``; pairs scoring at least ``threshold`` are reported.

    Parameters:
    df (pd.DataFrame): The original DataFrame containing director names. It is not modified.
    director_column (str): The column name containing the director names.
                           Rows where this column is missing are ignored.
    threshold (int): Similarity score threshold for fuzzy matching (default: 85).

    Returns:
    pd.DataFrame: A DataFrame with potential merge candidates, sorted by descending
                  similarity score, with the columns "Original Name", "Processed Name",
                  "Potential Merge Original", "Potential Merge Processed" and
                  "Similarity Score". Because every name is compared against every
                  other name, a matching pair is reported once from each side.
    """

    # Step 1: Preprocess the names (normalize, strip, drop initials, lowercase)
    def preprocess_name(name):
        # Decompose accented characters and drop everything that is not ASCII
        name = unicodedata.normalize("NFKD", name).encode("ascii", "ignore").decode("utf-8")
        name = re.sub(r"\s+", " ", name.strip())  # Replace multiple spaces with single space
        # Remove special characters; commas are kept because they separate co-directors
        name = re.sub(r"[^\w\s,]", "", name)

        # Remove one or two consecutive capital initials (e.g. "M ", "R A ").
        # Periods were already stripped above, so in practice an initial is
        # recognised by the whitespace that follows it.
        name = re.sub(r"\b(?:[A-Z](?:\.|\s)){1,2}\b", "", name)

        return name.lower()

    # Step 2: Preprocess and split multiple names in the column.
    # Work on a copy so the caller's frame is untouched; rows without a director
    # are skipped because a missing value cannot be normalised as text.
    df_temp = df.dropna(subset=[director_column]).copy()
    df_temp["original_name"] = df_temp[director_column]
    df_temp["processed_name"] = df_temp["original_name"].apply(preprocess_name)
    df_temp["split_names"] = df_temp["processed_name"].apply(lambda x: x.split(","))

    # Explode the DataFrame so that each name is on a separate row
    df_exploded = df_temp.explode("split_names")
    df_exploded["split_names"] = df_exploded["split_names"].str.strip()  # Strip extra spaces

    # Step 3: Define a function to find candidates for merging using fuzzy matching
    def find_merge_candidates(df, threshold=85):
        """
        Finds names that are candidates for merging based on fuzzy matching.
        """
        names = df["split_names"].unique()

        # First original spelling seen for each processed name. Built once here
        # instead of re-scanning the whole frame for every match in the loop.
        first_original = (
            df.drop_duplicates(subset="split_names")
            .set_index("split_names")["original_name"]
            .to_dict()
        )

        merge_candidates = []
        for name in names:
            # score_cutoff makes rapidfuzz return only matches at or above the threshold
            matches = process.extract(
                name, names, scorer=fuzz.ratio, limit=None, score_cutoff=threshold
            )
            for match_name, score, _ in matches:  # third value (choice index) is not needed
                if name != match_name:
                    merge_candidates.append(
                        (first_original[name], name, first_original[match_name], match_name, score)
                    )

        return pd.DataFrame(
            merge_candidates,
            columns=[
                "Original Name",
                "Processed Name",
                "Potential Merge Original",
                "Potential Merge Processed",
                "Similarity Score",
            ],
        )

    # Step 4: Get the merge candidates DataFrame
    merge_candidates_df = find_merge_candidates(df_exploded, threshold)

    # Step 5: Sort the results for better readability
    return merge_candidates_df.sort_values(by="Similarity Score", ascending=False)

In [None]:
merge_candidates = preprocess_and_find_merge_candidates(df, "director", threshold=95)

In [None]:
merge_candidates.loc[:, ["Original Name", "Potential Merge Original", "Similarity Score"]].iloc[::2]

In [None]:
df.loc[(df["director"] == "Mrighdeep Singh Lamba") | (df["director"] == "Mrigdeep Singh Lamba")]

In [None]:
from collections import defaultdict


def clean_director_name(name: str):
    """Return a canonical lookup key for a director name.

    The name is lower-cased and every space, hyphen and period is removed, so
    spelling variants such as "K. C. Bokadia" and "K C Bokadia" share one key.
    """
    lowered = name.lower()
    # One translate pass deletes all three separator characters.
    return lowered.translate(str.maketrans("", "", " -."))


def create_name_map(df: pd.DataFrame):
    """Group the director spellings found in ``df`` by their cleaned key.

    Each value of the ``director`` column may list several comma-separated
    names. Every name is stripped of surrounding whitespace and filed under the
    key returned by ``clean_director_name``.

    Returns:
        A ``defaultdict`` mapping each cleaned key to the set of original
        spellings that produced it.
    """
    spellings_by_key = defaultdict(set)

    for director_cell in df["director"]:
        for raw_name in director_cell.split(","):
            spelling = raw_name.strip()
            spellings_by_key[clean_director_name(spelling)].add(spelling)

    return spellings_by_key


director_name_map = create_name_map(df)
filtered_name_map = {key: value for key, value in director_name_map.items() if len(value) > 1}


print("Directors with more than 1 value:")
filtered_name_map

In [16]:
import numpy as np

# Placeholder credits are not real directors, so treat them as missing values.
# The column selector matters: without it the assignment would blank every
# column of the matching rows, and without the comma the two adjacent string
# literals are concatenated into a value that never occurs in the data.
df.loc[df["director"] == "3 directors", "director"] = np.nan
df.loc[:, "director"] = df.loc[:, "director"].replace("Unknown", np.nan)

# Preprocessing (NEEDS WORK!)


In [17]:
%autoreload 2
from src.dataset import lemmatize_tokens, extract_noun_phrases, word_tokenize
import string


def preprocess_sentence(sentence: str) -> str:
    """Preprocess the sentence by tokenizing, lemmatizing, and joining noun phrases.

    Steps:
      1. Strip ASCII punctuation and lower-case the sentence, then tokenize it.
      2. Extract noun phrases from the *original* sentence and glue the words of
         each phrase together into a single token (all whitespace removed).
      3. Lemmatize the tokens from step 1.
      4. Concatenate lemmas and glued noun phrases, drop repeated tokens while
         keeping the first occurrence in place, and join with single spaces.

    The de-duplication preserves order on purpose: iterating a ``set`` of
    strings yields an order that can change between interpreter sessions (string
    hashing is randomised), which would make any order-sensitive downstream
    feature, such as word n-grams, irreproducible.

    Returns:
        str: the space-separated unique tokens, in order of first appearance.
    """
    cleaned_sentence = sentence.translate(str.maketrans("", "", string.punctuation)).lower()
    tokens = word_tokenize(cleaned_sentence)
    noun_phrases = extract_noun_phrases(sentence)
    noun_phrases_joined = ["".join(phrase.split()) for phrase in noun_phrases]
    lemmatized_tokens = lemmatize_tokens(tokens)
    # dict.fromkeys de-duplicates like a set but keeps insertion order.
    combined_tokens = list(dict.fromkeys(lemmatized_tokens + noun_phrases_joined))
    return " ".join(combined_tokens)


df["title"] = df["title"].apply(preprocess_sentence)
df["description"] = df["description"].apply(preprocess_sentence)

# Feature Engineering


In [16]:
# Derive a coarse geographic "region" feature from the film's origin ("from").
# Series.map with a dict leaves NaN for any origin that is not listed below.
# NOTE(review): confirm every value of df["from"] is covered, otherwise the
# "region" column will contain missing values.
df["region"] = df["from"].map(
    {
        # Western
        "American": "Western",
        "British": "Western",
        "Canadian": "Western",
        "Australian": "Western",
        # South Asian ("Bangladeshi" further down belongs to this group too)
        "Bollywood": "South Asian",
        "Telugu": "South Asian",
        "Tamil": "South Asian",
        "Malayalam": "South Asian",
        "Bengali": "South Asian",
        "Kannada": "South Asian",
        "Marathi": "South Asian",
        "Punjabi": "South Asian",
        "Assamese": "South Asian",
        # East Asian
        "Chinese": "East Asian",
        "Japanese": "East Asian",
        "South_Korean": "East Asian",
        "Hong Kong": "East Asian",
        # Remaining origins
        "Filipino": "Southeast Asian",
        "Bangladeshi": "South Asian",
        "Russian": "European",
        "Turkish": "Middle Eastern",
        "Egyptian": "Middle Eastern",
        "Malaysian": "Southeast Asian",
    }
)

In [18]:
%autoreload 2

from src.dataset import filter_duplicate_descriptions

filter_duplicate_descriptions(df, "description", "title")

Unnamed: 0,title,from,genre,director,description,region
5748,Black Dalia,Malayalam,action,baburaj,A student of Sacred Heart Medical College is f...,South Asian
4030,Anthima Theerpu,Telugu,action,advbaburaj,A student of Sacred Heart Medical College is f...,South Asian
435,Bhayya,Telugu,action,boopathypandian,Anbu (Vishal) is a happy-go-lucky engineering ...,South Asian
7436,Malaikottai,Tamil,action,boopathypandian,Anbu (Vishal) is a happy-go-lucky engineering ...,South Asian
3028,The Protector,Hong Kong,action,jamesglickenhaus,"As noted above, Jackie Chan re-edited The Prot...",East Asian
7711,"Protector, The",American,action,jamesglickenhaus,"As noted above, Jackie Chan re-edited The Prot...",Western
3038,36 Hours,British,crime,montgomerytully,"Bill Rogers (Dan Duryea), an American jet pilo...",Western
2445,Terror Street,American,crime,montgomerytully,"Bill Rogers (Dan Duryea), an American jet pilo...",Western
6780,Harry Tracy,American,western,williamagraham,"By the end of the 19th century, Butch Cassidy,...",Western
1452,"Harry Tracy, Desperado",Canadian,western,williamgraham,"By the end of the 19th century, Butch Cassidy,...",Western


# Feature Selection


In [19]:
%autoreload 2
from src.logratioanalysis import LogRatioAnalysis

logratio_title = LogRatioAnalysis(df, "title", "genre")
logratio_description = LogRatioAnalysis(df, "description", "genre")

In [None]:
%autoreload 2
from src.logratioanalysis import plot_scree_subplots_for_genres

genres = df.genre.unique()
plot_scree_subplots_for_genres(logratio_title)

In [None]:
%autoreload 2
plot_scree_subplots_for_genres(logratio_description)

In [22]:
%autoreload 2

description_tokens = logratio_description.feature_selection(25000)
title_tokens = logratio_title.feature_selection(1000)

In [23]:
import nltk


def select_tokens(text, selected_tokens, tokenizer=nltk.word_tokenize):
    """
    Cleans a single document by keeping only the tokens present in the selected_tokens set.

    Parameters:
    text (str): The text document to clean. Non-string values (e.g. NaN) are
                returned unchanged.
    selected_tokens (set or list): The set or list of tokens to retain in the text.
                Anything that is not already a set/frozenset is converted to a set
                once per call so each membership test is O(1) instead of a linear
                scan. When applying this function to many documents, pass a set to
                avoid repeating the conversion.
    tokenizer (function): A function to tokenize the text (defaults to nltk.word_tokenize).

    Returns:
    str: The cleaned text with only the selected tokens, in their original order
         and joined by single spaces.
    """

    if not isinstance(text, str):
        return text

    # A 25,000-entry list would otherwise be scanned for every single token.
    if not isinstance(selected_tokens, (set, frozenset)):
        selected_tokens = set(selected_tokens)

    return " ".join(token for token in tokenizer(text) if token in selected_tokens)

In [24]:
# Didn't Work :')
# df['cleaned_description'] = df["description"].apply(select_tokens, selected_tokens=description_tokens)

# Modelling


In [28]:
%autoreload 2
from sklearn.preprocessing import LabelEncoder

# Encode the genre labels as integers for the classifier.
label_encoder = LabelEncoder()
# NOTE(review): this overwrites df["genre"] in place. Re-running the cell would
# re-fit the encoder on the already-encoded integers, so label_encoder.classes_
# would no longer hold the original genre names — restart from the load cell
# before re-running.
df["genre"] = label_encoder.fit_transform(df["genre"])
# Replace missing directors with an empty string — presumably so the text
# vectorizer applied to this column downstream only receives strings.
df["director"] = df["director"].fillna("")

# Features are every column except the target; the target is the encoded genre.
X_train = df.drop("genre", axis=1)
y_train = df["genre"]

In [None]:
%autoreload 2
from mlxtend.feature_selection import ColumnSelector
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import HistGradientBoostingClassifier
from skrub import SelectCols, SimilarityEncoder

# Feature extraction: five independent branches, one per input column, whose
# outputs are concatenated side by side by make_union.
text_pipeline = make_union(
    # Branch 1 — title: TF-IDF over 2- to 4-word n-grams, scaled without
    # centering (with_mean=False), then reduced with truncated SVD.
    make_pipeline(
        ColumnSelector("title", drop_axis=True),
        TfidfVectorizer(ngram_range=(2, 4)),
        StandardScaler(with_mean=False),
        TruncatedSVD(),
    ),
    # Branch 2 — description: same recipe with the default (unigram) TF-IDF.
    make_pipeline(
        ColumnSelector("description", drop_axis=True),
        TfidfVectorizer(),
        StandardScaler(with_mean=False),
        TruncatedSVD(),
    ),
    # Branch 3 — region: similarity-based encoding of the category strings.
    make_pipeline(SelectCols("region"), SimilarityEncoder()),
    # Branch 4 — from: dense one-hot encoding.
    make_pipeline(SelectCols("from"), OneHotEncoder(sparse_output=False)),
    # Branch 5 — director: TF-IDF, scaled and reduced like the text branches.
    make_pipeline(
        ColumnSelector("director", drop_axis=True),
        TfidfVectorizer(ngram_range=(1, 2)),  # unigrams and bigrams
        StandardScaler(with_mean=False),
        TruncatedSVD(),
    ),
)

# Full model: the concatenated features feed a histogram gradient-boosting classifier.
pipeline = make_pipeline(text_pipeline, HistGradientBoostingClassifier())
pipeline

In [None]:
from scipy.stats import loguniform, randint
from sklearn.model_selection import RandomizedSearchCV


# Search space for the randomized hyper-parameter search. Keys follow the step
# names that make_pipeline / make_union generate automatically (lower-cased
# class names, numbered when repeated).
param_distributions = {
    "histgradientboostingclassifier__learning_rate": loguniform(0.03, 0.07),
    # scipy's randint excludes the upper bound, e.g. randint(4, 6) draws 4 or 5.
    "histgradientboostingclassifier__max_iter": randint(250, 350),
    "histgradientboostingclassifier__max_depth": randint(4, 6),
    "histgradientboostingclassifier__min_samples_leaf": randint(85, 95),
    "histgradientboostingclassifier__max_leaf_nodes": randint(120, 140),
    "histgradientboostingclassifier__l2_regularization": loguniform(0.0005, 0.003),
    # SVD sizes for the title (pipeline-1), description (pipeline-2) and
    # director (pipeline-5) branches of the feature union.
    "featureunion__pipeline-1__truncatedsvd__n_components": randint(200, 400),
    "featureunion__pipeline-2__truncatedsvd__n_components": randint(800, 2000),
    "featureunion__pipeline-5__truncatedsvd__n_components": randint(200, 400),
}

# Randomized search: 10 sampled configurations, each scored by 3-fold
# cross-validated accuracy. random_state fixes the sampled configurations and
# error_score="raise" aborts on the first failing fit instead of recording NaN.
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=10,
    cv=3,
    scoring="accuracy",
    random_state=42,
    verbose=4,
    error_score="raise",
)

# Run the search on the training data
random_search.fit(X_train, y_train)

# Get the best parameters, mean cross-validated score, and refitted model
best_params = random_search.best_params_
best_score = random_search.best_score_
best_model = random_search.best_estimator_

print("\n Best Parameters:", best_params)
print("\n Best Score:", best_score)