In [1]:
import pandas as pd
import requests as req
import bs4

In [2]:
# Read the CSV of legitimate URLs and their precomputed features into a DataFrame.
df = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/Research content/data/legit_features.csv")

In [3]:
# Preview the first five rows to confirm the file loaded as expected.
df.head(n=5)

Unnamed: 0,url,words
0,http://www.crestonwood.com/router.php,6
1,http://rgipt.ac.in,2
2,http://www.mutuo.it,4
3,http://vamoaestudiarmedicina.blogspot.com/,4
4,https://www.astrologyonline.eu/Astro_MemoNew/P...,7


In [4]:
# Create the columns that the scraping loop fills in; every row starts out as
# an empty string until it has been processed.
for column in ('no_of_forms', 'no_of_iframes', 'missing_title',
               'title_length', 'favicon', 'live'):
    df[column] = ''

In [None]:
# prompt: iterate through the df for each of the urls and extract the number of forms in the webpage using requests and bs4 and add a new column to the df named "no_of_forms"
# Assuming df is your DataFrame

# For every URL: fetch the page, parse the HTML, and record the number of
# <form> and <iframe> tags, whether a usable title exists and how many words
# it has, whether a favicon link is declared, and whether the page was
# reachable ('live'). Pages that cannot be fetched or parsed keep the
# fallback values defined at the top of each iteration.
for i in range(len(df)):
    # Read the URL before entering the try block so the error message below
    # always refers to the row currently being processed.
    url = df['url'][i]

    # Fallback row: used unchanged when the request or the parsing fails.
    features = {
        'no_of_forms': 0,
        'no_of_iframes': 0,
        'missing_title': 1,
        'title_length': 0,
        'favicon': 0,
        'live': 0,
    }

    try:
        response = req.get(url, timeout=3)

        # Any status other than 200 is treated as "not live".
        if response.status_code != 200:
            raise RuntimeError(f"unexpected HTTP status {response.status_code}")

        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        forms = soup.find_all('form')
        iframes = soup.find_all('iframe')

        # title_text is None when there is no <title> tag or when its
        # .string is None; both cases count as a missing title.
        title_text = soup.title.string if soup.title else None

        favicon_tag = soup.find('link', rel='icon') or soup.find('link', rel='shortcut icon')
        has_favicon = favicon_tag is not None and 'href' in favicon_tag.attrs

        features = {
            'no_of_forms': len(forms),
            'no_of_iframes': len(iframes),
            'missing_title': 1 if title_text is None else 0,
            # Title "length" is measured in whitespace-separated words.
            'title_length': 0 if title_text is None else len(title_text.split()),
            'favicon': 1 if has_favicon else 0,
            'live': 1,
        }

        # Progress output. Uses title_text rather than soup.title.string so a
        # reachable page without a <title> no longer raises AttributeError
        # here and gets recorded as not live.
        print(len(forms), len(iframes), title_text)

    except Exception as e:
        # Best effort: report the failure and keep the fallback values.
        print(f"Error processing URL {url}: {e}")

    # Use loc to update DataFrame columns at specific indices
    for column, value in features.items():
        df.loc[i, column] = value



In [7]:
# Preview the first five rows with the newly filled feature columns.
df.head(n=5)

Unnamed: 0,url,words,no_of_forms,no_of_iframes,missing_title,title_length,favicon,live
0,http://www.crestonwood.com/router.php,6,0,0,1,0,0,0
1,http://rgipt.ac.in,2,1,0,0,12,1,1
2,http://www.mutuo.it,4,1,0,0,6,1,1
3,http://vamoaestudiarmedicina.blogspot.com/,4,0,0,0,4,1,1
4,https://www.astrologyonline.eu/Astro_MemoNew/P...,7,0,0,0,5,0,1


In [8]:
# Save the full table (reachable and unreachable rows alike) without the
# index column.
df.to_csv(path_or_buf="1_legit.csv", index=False)

In [9]:
# Display the rows ordered by the 'live' flag (0 first, then 1).
df.sort_values(by='live')

Unnamed: 0,url,words,no_of_forms,no_of_iframes,missing_title,title_length,favicon,live
0,http://www.crestonwood.com/router.php,6,0,0,1,0,0,0
661,https://systterpuessen.files.wordpress.com/201...,13,0,0,1,0,0,0
652,https://www.diecarfactory.de,5,0,0,1,0,0,0
2845,http://www.shabakeasabi.ir/index.php?do=home,4,0,0,1,0,0,0
2839,http://flint.cs.yale.edu/cs422/doc/art-of-asm/...,4,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...
1547,http://nerdsmagazine.com/alternative-google-se...,11,2,0,0,11,1,1
1548,https://www.regielive.ro/,4,3,0,0,5,0,1
1549,http://www.slideserve.com/ronli/software-as-a-...,12,1,0,0,15,1,1
1535,https://www.youtube.com/watch?v=a9qD50wsQCo,6,0,1,0,14,1,1


In [10]:
# Keep only the rows whose page was reachable. The explicit copy makes
# `online` an independent DataFrame, so adding a column to it later does not
# trigger SettingWithCopyWarning.
online = df[df.live == 1].copy()

In [11]:
# (rows, columns) of the reachable-only table.
online.shape

(4151, 8)

In [17]:
# Finalise the dataset and export it in a single cell, in the order the steps
# were originally executed: drop the helper 'live' column (every remaining row
# has live == 1), add the constant 'status' column (presumably the class label
# for legitimate URLs), then write the CSV. Saving last ensures a
# top-to-bottom run writes the finished table rather than an intermediate one.
online = online.drop(columns=['live'])
online['status'] = '0'
online.to_csv("1_legit.csv", index=False)

In [16]:
# Preview the first five rows of the reachable-only table.
online.head(n=5)

Unnamed: 0,url,words,no_of_forms,no_of_iframes,missing_title,title_length,favicon,status
1,http://rgipt.ac.in,2,1,0,0,12,1,0
2,http://www.mutuo.it,4,1,0,0,6,1,0
3,http://vamoaestudiarmedicina.blogspot.com/,4,0,0,0,4,1,0
4,https://www.astrologyonline.eu/Astro_MemoNew/P...,7,0,0,0,5,0,0
5,https://www.lifewire.com/tcp-port-21-818146,6,4,2,0,10,1,0


In [None]:
import textstat

def get_text_statistics(text):
    """Compute a set of textstat readability scores for ``text``.

    Args:
        text: The string to analyse.

    Returns:
        dict: Maps each metric's display name to the value returned by the
        corresponding textstat function. The same dictionary is printed
        before it is returned.
    """
    # Display name -> textstat scorer, listed in the order they are evaluated.
    scorers = (
        ('Flesch-Kincaid Grade', textstat.flesch_kincaid_grade),
        ('Gunning Fog', textstat.gunning_fog),
        ('Coleman-Liau', textstat.coleman_liau_index),
        ('ARI', textstat.automated_readability_index),
        ('Dale-Chall Readability Score', textstat.dale_chall_readability_score),
    )

    statistics = {label: scorer(text) for label, scorer in scorers}

    print(statistics)

    return statistics



# Spot-check the readability helper on the first len(online)//1000 URLs.
for i in range(len(online)//1000):
  # Positional lookup: `online` keeps the row labels of df, and the rows that
  # were filtered out as not live leave gaps (label 0 is absent), so the
  # label-based online['url'][i] raises KeyError.
  text_to_analyze = online['url'].iloc[i]
  result = get_text_statistics(text_to_analyze)

  # Print the results
  for stat, value in result.items():
    print(f"{stat}: {value}")


ModuleNotFoundError: No module named 'textstat'