# Automatic Website Migration Tool

## How to Use

1. **Prepare Data**:
   - Crawl both the live and staging websites using Screaming Frog SEO Spider.
   - Export the crawled data as CSV files.

2. **Run Cells**
   - Run each cell in sequence to progress the script. You can run all cells, but be aware it will use the default column selection when you do this.

3. **Upload Files**:
   - Upload the live and staging crawl files when prompted by the script.

4. **Column Selection**:
   - By default, the application searches for columns named 'Address', 'H1-1', and 'Title 1'. These can be manually mapped if not automatically found.
   - Users can select up to three columns for the matching process.

5. **Processing**:
   - Click the 'Process Files' button to start the comparison and matching process.

6. **Download Results**:
   - Once the processing is complete, the file will automatically be downloaded.

In [1]:
# Install pinned versions of the third-party packages imported in the next cell
# (polyfuzz for matching, tqdm for progress bars, plotly for the Sankey chart).
!pip install polyfuzz==0.4.2
!pip install tqdm==4.66.1
!pip install plotly==5.18.0

Collecting polyfuzz==0.4.2
  Downloading polyfuzz-0.4.2-py2.py3-none-any.whl (36 kB)
Collecting rapidfuzz>=0.13.1 (from polyfuzz==0.4.2)
  Downloading rapidfuzz-3.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rapidfuzz, polyfuzz
Successfully installed polyfuzz-0.4.2 rapidfuzz-3.5.2
Collecting plotly==5.18.0
  Downloading plotly-5.18.0-py3-none-any.whl (15.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.6/15.6 MB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: plotly
  Attempting uninstall: plotly
    Found existing installation: plotly 5.15.0
    Uninstalling plotly-5.15.0:
      Successfully uninstalled plotly-5.15.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of th

In [2]:
import pandas as pd
import ipywidgets as widgets
import plotly.graph_objects as go
import urllib.parse
import threading
import numpy as np


from polyfuzz import PolyFuzz
from tqdm import tqdm
from IPython.display import display
from google.colab import files

In [3]:
# Prompt for the live-site crawl export. The returned mapping is consumed in a
# later cell, which takes its first key as the file name to read.
print("Please upload the live file:")
uploaded_live = files.upload()

Please upload the live file:


Saving halfords_internal_html.csv to halfords_internal_html.csv


In [4]:
# Prompt for the staging-site crawl export. The returned mapping is consumed in a
# later cell, which takes its first key as the file name to read.
print("Please upload the staging file:")
uploaded_staging = files.upload()

Please upload the staging file:


Saving evans_cycles_crawl.csv to evans_cycles_crawl.csv


In [5]:
# A cancelled or empty upload leaves the mapping empty; fail with a clear message
# instead of the bare StopIteration that next(iter(...)) would raise.
if not uploaded_live or not uploaded_staging:
    raise ValueError("Both a live and a staging CSV file must be uploaded before continuing.")

# The first key of each upload mapping is used as the name of the file to read.
live_file_name = next(iter(uploaded_live))
staging_file_name = next(iter(uploaded_staging))

# Read every column as text so the string operations applied later work on all columns.
df_live = pd.read_csv(live_file_name, dtype="str")
df_staging = pd.read_csv(staging_file_name, dtype="str")

In [6]:
# Lower-case every column of both crawls (this includes the 'Address' URLs).
# The .str accessor is applied to each column, which relies on the frames having
# been read with dtype="str".
print("Preprocessing data...")
df_live = df_live.apply(lambda col: col.str.lower())
df_staging = df_staging.apply(lambda col: col.str.lower())


Preprocessing data...


# Select Columns to Match On
## Select the columns to match on using the drop down menu and press the 'Update selected columns' button once selected.

In [7]:
# Offer only columns that exist in BOTH crawls: a column missing from the staging
# file cannot be matched and would raise a KeyError once df_staging is indexed with it.
column_list = ['Optional'] + [col for col in df_live.columns if col in df_staging.columns]

# Preferred pre-selection for each dropdown
default_values = {
    'Column 1': 'Address',  # Default value for Column 1
    'Column 2': 'H1-1',     # Default value for Column 2
    'Column 3': 'Title 1'   # Default value for Column 3
}

# Fall back to 'Optional' when a preferred default is not an available option;
# Dropdown would otherwise reject the unknown value.
for col, default_value in default_values.items():
    if default_value not in column_list:
        default_values[col] = 'Optional'

# One dropdown per entry of default_values, labelled '<key>:' and pre-selected
# with the (possibly downgraded) default.
dropdown1, dropdown2, dropdown3 = (
    widgets.Dropdown(options=column_list, description=f'{slot}:', value=default)
    for slot, default in default_values.items()
)

display(dropdown1, dropdown2, dropdown3)

# Initialize selected_columns with the current dropdown values, ignoring unused slots
selected_columns = [dropdown1.value, dropdown2.value, dropdown3.value]
selected_columns = [col for col in selected_columns if col != 'Optional']
print("Selected Columns:", selected_columns)

# Function to get selected columns
def get_selected_columns():
    """Return the columns currently chosen in the three dropdowns.

    Slots left on 'Optional' (or holding an empty string) are ignored. A column
    chosen in more than one dropdown is returned once, at its first position, so
    that it is not matched, scored and output twice.

    Returns:
        list: The distinct selected column names, in dropdown order.
    """
    chosen = (dropdown1.value, dropdown2.value, dropdown3.value)
    # dict.fromkeys keeps first-seen order while dropping repeats
    distinct = dict.fromkeys(col for col in chosen if col not in ('', 'Optional'))
    return list(distinct)

# Update the selected columns if the dropdowns are changed
def on_dropdown_change(change):
    """Refresh the global ``selected_columns`` from the dropdowns and print it.

    Args:
        change: The change notification passed by the widget observer; it is not
            inspected, the selection is re-read from the dropdowns instead.
    """
    global selected_columns
    selected_columns = get_selected_columns()
    print("Updated Selected Columns:", selected_columns)

# Add observers to the dropdowns to update selected columns automatically.
# names='value' restricts the callback to changes of each dropdown's selected value.
dropdown1.observe(on_dropdown_change, names='value')
dropdown2.observe(on_dropdown_change, names='value')
dropdown3.observe(on_dropdown_change, names='value')

Dropdown(description='Column 1:', index=1, options=('Optional', 'Address', 'Redirect URL', 'Status Code', 'Sta…

Dropdown(description='Column 2:', index=19, options=('Optional', 'Address', 'Redirect URL', 'Status Code', 'St…

Dropdown(description='Column 3:', index=8, options=('Optional', 'Address', 'Redirect URL', 'Status Code', 'Sta…

Selected Columns: ['Address', 'H1-1', 'Title 1']


In [30]:
def update_matching_columns(b=None):
    """Copy the current dropdown selection into the global ``matching_columns``.

    Args:
        b: Unused. It defaults to None so the function can be called with or
            without an argument.
    """
    global matching_columns
    matching_columns = get_selected_columns()
    print("Selected columns for matching:", matching_columns)

# You can use a button or just call the function directly.
# Re-run this cell after changing a dropdown so matching_columns picks up the change.
update_matching_columns()

Selected columns for matching: ['Address', 'H1-1', 'Title 1']


In [31]:
# Build the PolyFuzz matcher with the "TF-IDF" method; the same model object is
# reused by match_and_score for every column.
print("Initializing PolyFuzz model...")
model = PolyFuzz("TF-IDF")


Initializing PolyFuzz model...


In [32]:
def match_and_score(col):
    """Match the live values of one column against the staging values.

    Args:
        col: Name of the column to match on.

    Returns:
        pd.DataFrame: The table returned by ``model.get_matches()``, or an empty
        frame with columns 'From', 'To' and 'Similarity' when the column is
        missing from either crawl or one of the value lists is empty.
    """
    no_matches = pd.DataFrame(columns=['From', 'To', 'Similarity'])

    # A column that is absent from one of the crawls cannot be matched; report it
    # and return an empty result instead of failing with a KeyError.
    if col not in df_live.columns or col not in df_staging.columns:
        print(f"Skipping {col}: column is not present in both files.")
        return no_matches

    # Handle NaN values by replacing them with an empty string
    live_list = df_live[col].fillna('').tolist()
    staging_list = df_staging[col].fillna('').tolist()

    # Perform matching only if both lists have at least one entry
    if not (live_list and staging_list):
        return no_matches

    print(f"Matching {col}...")
    model.match(live_list, staging_list)
    return model.get_matches()

# Match each column and collect scores (column name -> table of matches)
print("Matching columns and collecting scores...")
matches_scores = {col: match_and_score(col) for col in tqdm(matching_columns, desc="Matching columns")}

# Extra staging values to report next to the best match: every matched column
# except 'Address', which is already reported as the matching URL. This is derived
# from matching_columns (the list actually matched above) rather than from
# selected_columns[1:], which assumed 'Address' was always chosen first.
selected_additional_columns = [col for col in matching_columns if col != 'Address']


# Function to find the overall best match for each row
def find_best_overall_match_and_median(row):
    """Pick the strongest staging match for one live row and summarise its scores.

    For every column in ``matching_columns`` the row's value is looked up in that
    column's table in ``matches_scores``. The column with the highest similarity
    wins; the first staging row holding the matched value supplies the URL and the
    extra 'Staging <column>' values.

    Args:
        row: One row of the live frame, as passed by ``DataFrame.apply(axis=1)``.

    Returns:
        pd.Series: Keys 'Best Match on', 'Highest Matching URL',
        'Highest Similarity Score', 'Best Match Content', 'Median Match Score',
        plus one 'Staging <column>' entry per column in
        ``selected_additional_columns``. Entries stay None (score 0) when the row
        has no usable match.
    """
    similarities = []
    best_match_info = {
        'Best Match on': None,
        'Highest Matching URL': None,
        'Highest Similarity Score': 0,
        # Initialised up front so every row yields the same set of keys
        'Best Match Content': None,
    }
    additional_info = {f'Staging {col}': None for col in selected_additional_columns}

    for col in matching_columns:
        matches = matches_scores[col]
        if matches.empty:
            continue

        match_row = matches.loc[matches['From'] == row[col]]
        if match_row.empty:
            continue

        first_match = match_row.iloc[0]
        similarity_score = first_match['Similarity']
        similarities.append(similarity_score)

        if similarity_score > best_match_info['Highest Similarity Score']:
            # Locate the staging rows once and reuse them for the URL and extras
            staging_rows = df_staging.loc[df_staging[col] == first_match['To']]
            if staging_rows.empty:
                # The matched value has no staging row to point at; keep the
                # previous best instead of failing on an empty lookup.
                continue
            staging_row = staging_rows.iloc[0]

            best_match_info['Best Match on'] = col
            best_match_info['Highest Matching URL'] = staging_row['Address']
            best_match_info['Highest Similarity Score'] = similarity_score
            best_match_info['Best Match Content'] = first_match['To']

            # Capture additional staging data based on user selection
            for add_col in selected_additional_columns:
                additional_info[f'Staging {add_col}'] = (
                    staging_row[add_col] if add_col in df_staging else None
                )

    # Median of the similarity scores found for this row (None when there were none)
    best_match_info['Median Match Score'] = np.median(similarities) if similarities else None

    # Combine best match info with additional staging data
    best_match_info.update(additional_info)

    return pd.Series(best_match_info)

Matching columns and collecting scores...


Matching columns:   0%|          | 0/3 [00:00<?, ?it/s]

Matching Address...


Matching columns:  67%|██████▋   | 2/3 [00:00<00:00,  5.41it/s]

Matching H1-1...
Matching Title 1...


Matching columns: 100%|██████████| 3/3 [00:00<00:00,  5.64it/s]


In [33]:
# Apply the function to find the best overall match and calculate the row-wise
# median match score. The results are joined to the live columns in the following
# "Compiling final results" cell, so the join is not repeated here.
match_results = df_live.apply(find_best_overall_match_and_median, axis=1)


In [34]:
print("Compiling final results...")
# 'Address' leads the output, followed by the other matched columns in selection order.
final_columns = ['Address', *(name for name in matching_columns if name != 'Address')]
# Place the per-row match results to the right of the selected live columns.
df_final = pd.concat((df_live[final_columns], match_results), axis='columns')

Compiling final results...


# Create Sankey Chart

In [35]:
def clean_name(url_part):
    """Turn a URL path fragment into a readable label.

    Slashes and hyphens become spaces, then surrounding whitespace is trimmed.
    """
    separators_to_spaces = str.maketrans('/-', '  ')
    spaced = url_part.translate(separators_to_spaces)
    return spaced.strip()


def extract_hierarchy_levels(url):
    """Split a URL's path into a list of cleaned folder names.

    Args:
        url: The URL to analyse. Non-string values (for example None or NaN on
            rows without a matching URL) are treated as having no path.

    Returns:
        list: One cleaned name per folder in the path, or ``['No Path']`` when
        the URL has no folders (homepage, a file at the root, or a missing URL).
    """
    if not isinstance(url, str):
        return ['No Path']

    parsed_url = urllib.parse.urlparse(url)
    # Drop empty segments: splitting an empty path yields [''], which previously
    # stopped the homepage from being reported as 'No Path'.
    path_parts = [part for part in parsed_url.path.split("/") if part]

    # Exclude files (e.g., .html pages): a last segment containing a dot is
    # treated as a file name rather than a folder.
    if path_parts and '.' in path_parts[-1]:
        path_parts = path_parts[:-1]

    if not path_parts:  # Empty path, or the path only held a file
        return ['No Path']

    # Create a hierarchy list with cleaned folder names
    return [clean_name(part) for part in path_parts]


def _hierarchy_pairs(levels):
    """Return one {'Source Level', 'Target Level'} record per (earlier, later) pair of levels."""
    return [
        {'Source Level': levels[i], 'Target Level': levels[j]}
        for i in range(len(levels))
        for j in range(i + 1, len(levels))
    ]


def prepare_sankey_data(df, top_x=20):
    """Count folder-level pairs across the live and matched URLs.

    Each URL is split into hierarchy levels, and every level is paired with every
    level that follows it in the same URL. Pairs from the 'Address' and
    'Highest Matching URL' columns are counted together.

    Side effect: adds the columns 'Source Hierarchy' and 'Target Hierarchy' to
    ``df`` in place.

    Args:
        df: Frame with 'Address' and 'Highest Matching URL' columns.
        top_x: Number of most frequent pairs to keep.

    Returns:
        pd.DataFrame: Columns 'Source Level', 'Target Level' and 'Count', holding
        the ``top_x`` most frequent pairs. Empty (same columns) when no URL has
        more than one level.
    """
    df['Source Hierarchy'] = df['Address'].apply(extract_hierarchy_levels)
    df['Target Hierarchy'] = df['Highest Matching URL'].apply(extract_hierarchy_levels)

    # Flatten the hierarchies to create source-target pairs for all levels
    rows = []
    for source_hierarchy, target_hierarchy in zip(df['Source Hierarchy'], df['Target Hierarchy']):
        rows.extend(_hierarchy_pairs(source_hierarchy))
        rows.extend(_hierarchy_pairs(target_hierarchy))

    # With no pairs, a frame built from an empty list has no columns and the
    # groupby below would raise a KeyError; return an empty, well-formed result.
    if not rows:
        return pd.DataFrame(columns=['Source Level', 'Target Level', 'Count'])

    sankey_data = pd.DataFrame(rows)

    # Aggregate and count the source-target pairs
    sankey_data = sankey_data.groupby(['Source Level', 'Target Level']).size().reset_index(name='Count')

    # Get top mappings by count for Sankey Chart
    return sankey_data.nlargest(top_x, 'Count')


def create_sankey_chart(sankey_data, title='Top 20 Folder Mappings'):
    """Build a Sankey figure from counted level pairs.

    Args:
        sankey_data: Frame with 'Source Level', 'Target Level' and 'Count' columns.
        title: Chart title. Defaults to the previously hard-coded text.

    Returns:
        The figure object; call ``.show()`` on it to render the chart.
    """
    # One node per distinct level name; names containing 'No Path' sort first,
    # the rest alphabetically.
    labels = sorted(
        set(sankey_data['Source Level']) | set(sankey_data['Target Level']),
        key=lambda label: ('No Path' not in label, label),
    )
    level_index = {level: idx for idx, level in enumerate(labels)}

    # Map levels to node indices
    source_indices = [level_index[level] for level in sankey_data['Source Level']]
    target_indices = [level_index[level] for level in sankey_data['Target Level']]
    weights = sankey_data['Count']

    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=labels,  # Use the cleaned labels for node names
            color="blue"
        ),
        link=dict(
            # Links run from 'Source Level' to 'Target Level'; the two index
            # lists were previously passed the wrong way round.
            source=source_indices,
            target=target_indices,
            value=weights
        ),
        # Set the layout for node alignment
        arrangement='snap'
    )])

    fig.update_layout(title_text=title)
    return fig

In [36]:
print("Preparing data for Sankey Chart...")
# prepare_sankey_data also adds 'Source Hierarchy' and 'Target Hierarchy' columns
# to df_final; they are dropped again before the CSV export.
sankey_data = prepare_sankey_data(df_final)  # Use your final processed dataframe
sankey_chart = create_sankey_chart(sankey_data)

# Display the Sankey Chart
sankey_chart.show()

Preparing data for Sankey Chart...


# Download the Data

In [37]:
print("Saving final results...")

# Remove the helper columns added for the Sankey chart. errors='ignore' keeps the
# cell re-runnable: a second run, or a run without the chart cell, no longer fails
# with a KeyError because the columns are already absent.
df_final = df_final.drop(columns=['Source Hierarchy', 'Target Hierarchy'], errors='ignore')

# Save the updated DataFrame without the dropped columns to a CSV file
file_name = "matched_data.csv"
df_final.to_csv(file_name, index=False)

# Download the CSV file
files.download(file_name)

print(f"{file_name} has been downloaded successfully.")

Saving final results...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

matched_data.csv has been downloaded successfully.
