***
## ***Step 1 -Fetching Technology and Netloc From Links Fetched***
***

### Importing Libraries

In [19]:
import pandas as pd
from urllib.parse import urlparse
import re
import tldextract

### Importing the links data

In [20]:
df = pd.read_csv('Step 0 - Raw Data/Website_Technography_Relationship.csv')

In [21]:
df.head()

Unnamed: 0,File Name,Link,Status
0,02june-fsa.com,https://static.wixstatic.com/media/,Present
1,02june-fsa.com,https://static.wixstatic.com/media/0525b5_c4e2...,Present
2,02june-fsa.com,./#comp-jbgap0ye,Present
3,02june-fsa.com,https://static.parastorage.com/unpkg/focus-wit...,Present
4,02june-fsa.com,https://panorama.wixapps.net/api/v1/bulklog,Present


***
***

# Raw Data to Netloc Unclean Table

### Cleaning the File Name and Link Columns

In [22]:
df_new = df.copy()

In [23]:
df_new['File Name'] = df_new['File Name'].str.lower()

In [24]:
df_new['Link'] = df_new['Link'].str.lower()

### Extrating Netloc Column

In [25]:
def extract_netloc(url):
    try:
        parsed_url = urlparse(str(url))
        original_netloc = parsed_url.netloc
        
        # If original_netloc is empty, consider it invalid
        if not original_netloc:
            return None
        
        return original_netloc
    except Exception as e:
        return None  # Return None instead of an empty string

def netloc_status(netloc):
    if netloc is None:
        return 'Absent'
    return 'Present'

In [26]:
# Create 'Netloc' column by applying 'extract_netloc' function
df_new['Netloc'] = df_new['Link'].apply(extract_netloc)

# Create 'Technology_Status' column by applying 'technology_status' function to the 'Netloc' column
df_new['Netloc_Status'] = df_new['Netloc'].apply(netloc_status)

In [27]:
df_new.head()

Unnamed: 0,File Name,Link,Status,Netloc,Netloc_Status
0,02june-fsa.com,https://static.wixstatic.com/media/,Present,static.wixstatic.com,Present
1,02june-fsa.com,https://static.wixstatic.com/media/0525b5_c4e2...,Present,static.wixstatic.com,Present
2,02june-fsa.com,./#comp-jbgap0ye,Present,,Absent
3,02june-fsa.com,https://static.parastorage.com/unpkg/focus-wit...,Present,static.parastorage.com,Present
4,02june-fsa.com,https://panorama.wixapps.net/api/v1/bulklog,Present,panorama.wixapps.net,Present


In [28]:
df_new.nunique()

File Name          51644
Link             4421980
Status                 2
Netloc            160479
Netloc_Status          2
dtype: int64

### Exporting Unclean table

df_new.to_csv('Step 1 - Links to Unclean Netloc Table/Unclean_Company_Technography_Relationship.csv', index=False)

***
***

# Creating a New Table having unique combnaion of File Name and Netloc

In [31]:
df_new[df_new['Netloc'].notnull()][['File Name', 'Netloc']].drop_duplicates().nunique()

File Name     50314
Netloc       160479
dtype: int64

In [32]:
# Extract unique combinations of 'File Name' and 'Netloc', excluding null 'Netloc'
df_unique = df_new[df_new['Netloc'].notnull()][['File Name', 'Netloc']].drop_duplicates()

In [33]:
# Add a new column based on substring comparison
df_unique['Netloc_Type'] = df_unique.apply(lambda row: 'Internal' if row['File Name'] in row['Netloc'] else 'External', axis=1)

In [34]:
df_unique.nunique()

File Name       50314
Netloc         160479
Netloc_Type         2
dtype: int64

In [35]:
df_unique.to_csv('Step 1 - Links to Unclean Netloc Table/Unclean_Company_Link_Relationship.csv', index=False)

***
# ***---------------------- END OF STEP 1 ---------------------***
***

# Creating Netloc Table from File Name Netloc Relationship Table

In [25]:
# Create a new DataFrame with unique 'Netloc' values where 'Netloc_Type' is 'External'
netloc_table = df_unique[df_unique['Netloc_Type'] == 'External']['Netloc'].drop_duplicates().reset_index(drop=True).to_frame()

In [26]:
netloc_table

Unnamed: 0,Netloc
0,static.wixstatic.com
1,static.parastorage.com
2,panorama.wixapps.net
3,www.instagram.com
4,frog.wix.com
...,...
105994,techvorm.com
105995,zyppysimages.s3.ap-south-1.amazonaws.com
105996,www.zzcraftsman.com
105997,ppnp.ac.id


## Fetching Domain and Subdomain Columns

In [27]:
# Function to extract subdomain and second-level domain (SLD)
def extract_subdomain_sld(netloc):
    ext = tldextract.extract(netloc)
    return pd.Series([ext.subdomain, ext.domain])

In [28]:
# Apply the function to the DataFrame
netloc_table[['Subdomain', 'Domain']] = netloc_table['Netloc'].apply(extract_subdomain_sld)

KeyboardInterrupt: 

In [None]:
netloc_table

Unnamed: 0,Netloc,Subdomain,Domain
0,static.wixstatic.com,static,wixstatic
1,static.parastorage.com,static,parastorage
2,panorama.wixapps.net,panorama,wixapps
3,www.instagram.com,www,instagram
4,frog.wix.com,frog,wix
...,...,...,...
105994,techvorm.com,,techvorm
105995,zyppysimages.s3.ap-south-1.amazonaws.com,zyppysimages.s3.ap-south-1,amazonaws
105996,www.zzcraftsman.com,www,zzcraftsman
105997,ppnp.ac.id,,ppnp


In [None]:
netloc_table.to_csv('Step 3 - Creating Netloc Table\\Netloc_Table.csv', index=False)

# Clean the Unclean Relationship Data

### Removing Duplicates from the Data

In [None]:
df_without_duplicates = df.drop_duplicates(subset=['File Name', 'Link'], keep='first')

### Cleaning Function of Technology Column

In [None]:
def technology_cleaning_function(df):
    # Ensure 'Technology' column exists in the DataFrame
    if 'Technology' in df.columns:
        # Define a list of patterns to remove
        patterns_to_remove = ["www", "www')", "www'", "www<", "www&", "www')+"]

        # Remove rows where 'Technology' column matches any pattern in patterns_to_remove
        df = df[~df['Technology'].isin(patterns_to_remove)]
        
        # Remove substrings that start with "|www."
        df = df[~df['Technology'].str.contains(r'\|www\.', case=False, na=False)]
        
        # Remove special characters from start or end of values in 'Technology' column
        df['Technology'] = df['Technology'].apply(lambda x: re.sub(r'^[^a-zA-Z0-9]+|[^a-zA-Z0-9]+$', '', str(x)))
    
    return df

In [None]:
cleaned_df_without_duplicates = technology_cleaning_function(df_without_duplicates)

### Exporting Clean Realtionship Table

In [None]:
cleaned_df_without_duplicates.to_csv('Step 2 - Unclean to Clean relationship Table/Clean_Company_Technography_Relationship.csv', index=False)

# Transformation of Cleaned Data for scoring Purposes

In [None]:
cleaned_df_without_duplicates[cleaned_df_without_duplicates['Technology'].str.contains("connect.facebook")]['Technology'].unique()

array(['connect.facebook.net', 'connect.facebook.net><script',
       'connect.facebook.com', 'connect.facebook.net><link',
       'static.ak.connect.facebook.com'], dtype=object)

In [None]:
cleaned_df_without_duplicates.nunique()

File Name              51644
Link                 4419960
Status                     2
Netloc                158463
Technology            147670
Technology_Status          2
dtype: int64

# Creating the Technology Details table from Relationship Table

In [None]:
Technology_Details = pd.DataFrame(cleaned_df_without_duplicates['Technology'].drop_duplicates().reset_index(drop=True), columns=['Technology'])

In [None]:
Technology_Details.to_csv('Step 4 - Relationship Table To Technology Details Table/Technology_Details.csv', index=False)