# Cài đặt PySpark

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [None]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

import findspark
findspark.init()

# Spark Context

In [None]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkContext('local')
sqlc = SQLContext(spark)

# Yêu cầu 1

In [None]:
!pip install validators

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting validators
  Downloading validators-0.20.0.tar.gz (30 kB)
Building wheels for collected packages: validators
  Building wheel for validators (setup.py) ... [?25l[?25hdone
  Created wheel for validators: filename=validators-0.20.0-py3-none-any.whl size=19581 sha256=fc6f877b0320cdf44b7d90c38d0742bb7f3aa75c822570152ee2f768f68e258d
  Stored in directory: /root/.cache/pip/wheels/5f/55/ab/36a76989f7f88d9ca7b1f68da6d94252bb6a8d6ad4f18e04e9
Successfully built validators
Installing collected packages: validators
Successfully installed validators-0.20.0


In [None]:
import requests
from bs4 import BeautifulSoup
import validators
import time

In [None]:
def getAllLink(url):
  # print(url)
  html = requests.get(url)
  soup = BeautifulSoup(html.text)
  html.close()
  # lấy link trong href của tất cả thẻ <a>
  allLink = [i.get('href') if (i.get('href') != None) 
    else '' for i in soup.findAll('a')]
  return allLink


In [None]:
# lấy danh sách liên kết trong url có domain hợp lệ
def allLinkValid(url, domain):
  allLink = getAllLink(url)

  # không có links trong url
  if not allLink:
    return

  # lấy những link có cùng domain
  # i.split('//')[-1].split('/')[0] là domain link i
  links = [i if (domain in i.split('//')[-1].split('/')[0]) 
    else '' for i in allLink]
  
  # lấy link hợp lệ bằng validators
  links = [i if validators.url(i) == True
    else '' for i in links]
  links = list(set(links))

  if '' in links:
      links.remove('')
    
  return links  

In [None]:
def crawlUrl(url, prefix, length):
  columns = StructType([StructField('Page',
                                    StringType(), True),
                      StructField('Successor',
                                  StringType(), True)])
  LinkStructType = StructType([StructField('link', StringType(), True)])
  
  
  # Create an empty dataframe with empty schema
  dfLinks = sqlc.createDataFrame(data = [], schema = columns)

  domain = prefix
  newLinks = sqlc.createDataFrame([{'link':url, 'index': 0}])

  while dfLinks.count() < length:
    # Danh sách link cột success chưa crawl
    BLinks = sqlc.createDataFrame(data = [], schema = LinkStructType)

    for i in range(newLinks.count()):
      url = newLinks.where(newLinks['index'] == i).collect()[0].link
      linktamp = allLinkValid(url, domain)

      # không có out-links trong html hoặc rỗng
      if linktamp is None or not linktamp:
        df = sqlc.createDataFrame([{'Page': url, 'Successor': ''}])
        dfLinks = dfLinks.union(df)
      else:
        Links = [{'Page': url, 'Successor': i} for i in linktamp]
        df = sqlc.createDataFrame(Links)
        dfLinks = dfLinks.union(df)

        # thêm list linktamp vào BLinks
        BLinks = BLinks.union(sqlc.createDataFrame([{'link': i} for i in linktamp]))

      # thoát ra nếu vượt giới hạn số lượng link
      if dfLinks.count() >= length:
        break

    # thêm cột index
    newLinks = BLinks.coalesce(1).withColumn("index", monotonically_increasing_id())
  return dfLinks.orderBy("Page")

In [None]:
start_time = time.time()
dfLinks = crawlUrl('https://tdtu.edu.vn', 'tdtu.edu.vn', 1000)
print(time.time() - start_time)

318.2294223308563


In [None]:
dfLinks.show(1000,truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Page                                                                                                                                                                  |Successor                                                                                                                                                             |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
dfLinks.count()

1033

In [None]:
!rm -rf LinkData/
dfLinks.coalesce(1).write.csv(path='LinkData', 
             header=True, 
             sep=',')
# dfLinks.rdd.saveAsTextFile('LinkData')

In [None]:
!zip -r /content/LinkData.zip /content/LinkData

  adding: content/LinkData/ (stored 0%)
  adding: content/LinkData/_SUCCESS (stored 0%)
  adding: content/LinkData/.part-00000-d45fa238-2f53-4a2f-a4cf-7c7bca9757b4-c000.csv.crc (stored 0%)
  adding: content/LinkData/part-00000-d45fa238-2f53-4a2f-a4cf-7c7bca9757b4-c000.csv (deflated 90%)
  adding: content/LinkData/._SUCCESS.crc (stored 0%)


# Yêu cầu 2


In [None]:
import numpy as np

In [None]:
!unzip LinkData.zip

Archive:  LinkData.zip
   creating: content/LinkData/
 extracting: content/LinkData/_SUCCESS  
 extracting: content/LinkData/.part-00000-d45fa238-2f53-4a2f-a4cf-7c7bca9757b4-c000.csv.crc  
  inflating: content/LinkData/part-00000-d45fa238-2f53-4a2f-a4cf-7c7bca9757b4-c000.csv  
 extracting: content/LinkData/._SUCCESS.crc  


In [None]:
if os.path.exists('/content/content/LinkData'):
  dfLinks = sqlc.read.csv('/content/content/LinkData',
                   header=False, sep=',')

In [None]:
dfLinks.show()

+--------------------+--------------------+
|                 _c0|                 _c1|
+--------------------+--------------------+
|                Page|           Successor|
|http://aaf.tdtu.e...|http://www.tdtu.e...|
|http://aimas.tdtu...|  http://tdtu.edu.vn|
|http://aimas.tdtu...|http://www.tdtu.e...|
|http://cait.tdtu....|http://cait.tdtu....|
|http://cait.tdtu....|http://cait.tdtu....|
|http://cait.tdtu....|http://cait.tdtu....|
|http://cait.tdtu....|http://cait.tdtu....|
|http://cait.tdtu....|http://cait.tdtu....|
|http://cait.tdtu....|http://en.cait.td...|
|http://cait.tdtu....|http://cait.tdtu....|
|http://cait.tdtu....|http://cait.tdtu....|
|http://cait.tdtu....|http://cait.tdtu....|
|http://cait.tdtu....|http://cait.tdtu....|
|http://cait.tdtu....|http://cait.tdtu....|
|http://cait.tdtu....|http://cait.tdtu....|
|http://cait.tdtu....|http://cait.tdtu....|
|http://cait.tdtu....|http://cait.tdtu....|
|http://cait.tdtu....|http://cait.tdtu....|
|http://cait.tdtu....| http://td

In [None]:
# Đánh dead-ends mỗi trang là 1 nếu có
df1 = dfLinks.withColumn('Dead-ends', when(dfLinks["Successor"].isNull(), 1).otherwise(0))
df1 = df1.select('Page', 'Dead-ends')
# thống kê out-links mỗi page
df2 = df1.groupBy('Page', 'Dead-ends').count().withColumnRenamed('count', 'Out-degree').orderBy("Page")


In [None]:
df1.show()

+--------------------+---------+
|                Page|Dead-ends|
+--------------------+---------+
|http://aaf.tdtu.e...|        0|
|http://aimas.tdtu...|        0|
|http://aimas.tdtu...|        0|
|http://cait.tdtu....|        0|
|http://cait.tdtu....|        0|
|http://cait.tdtu....|        0|
|http://cait.tdtu....|        0|
|http://cait.tdtu....|        0|
|http://cait.tdtu....|        0|
|http://cait.tdtu....|        0|
|http://cait.tdtu....|        0|
|http://cait.tdtu....|        0|
|http://cait.tdtu....|        0|
|http://cait.tdtu....|        0|
|http://cait.tdtu....|        0|
|http://cait.tdtu....|        0|
|http://cait.tdtu....|        0|
|http://cait.tdtu....|        0|
|http://cait.tdtu....|        0|
|http://cait.tdtu....|        0|
+--------------------+---------+
only showing top 20 rows



In [None]:
df2.show()

+--------------------+---------+----------+
|                Page|Dead-ends|Out-degree|
+--------------------+---------+----------+
|http://aaf.tdtu.e...|        0|         1|
|http://aimas.tdtu...|        0|         2|
|http://cait.tdtu....|        0|        34|
|http://caodang.td...|        0|         6|
|http://caodang.td...|        0|         6|
|http://civil.tdtu...|        0|         1|
|http://ecc.tdtu.e...|        0|         2|
|http://enlabsafe....|        0|         1|
|http://fba.tdtu.e...|        0|         1|
|http://feee.tdtu....|        0|         2|
|http://ffl.tdtu.e...|        0|         8|
|http://finance.td...|        0|         1|
|http://fms.tdtu.e...|        0|        10|
|http://fss.tdtu.e...|        0|       178|
|http://grad.tdtu....|        0|         6|
|http://idiscovery...|        1|         1|
|http://idiscovery...|        0|         6|
|http://idiscovery...|        0|         6|
|http://idiscovery...|        0|         6|
|http://ifa.tdtu.e...|        0|

In [None]:
columns = StructType([StructField(str(i), FloatType(), True) for i in range(df2.count())])
dfM = sqlc.createDataFrame(data = [], schema = columns)
dfPage = df2.coalesce(1).withColumn("index", monotonically_increasing_id())
# tạo ma trận 90*90
M = np.full((df2.count(), df2.count()), 0.0)

for i in range(dfPage.count()):
  # lấy từng link trong dfPage theo index
  page = dfPage.where(dfPage.index == i).collect()[0].Page

  # lấy các link cột Successor có link là page
  a = dfLinks.where(dfLinks.Page == page).select('Successor')

  # lấy row trong dfPage nếu link Successor có trong danh sách Page
  dfIndex = dfPage.join(a, dfPage.Page == a.Successor)
  
  # lấy thứ tự index trong dfIndex
  indexList = [i[0] for i in dfIndex.select('index').collect()]
  # data = [[1/(len(indexList)) if i in indexList else 0.0 for i in range(df2.count())]]

  # đánh trọng số có cột nằm trong indexList
  for j in range(len(indexList)):
    M[i][indexList[j]] = 1/len(indexList)

In [None]:
# cho 1/90 với dòng toàn 0.0
b = M
def check(m):
  for i in m:
    if i > 0:
      return True
  return False
for i in range(len(b)):
  if not check(b[i]):
    b[i] = [1/len(b)] * len(b)

In [None]:
import sys
np.set_printoptions(threshold=sys.maxsize)
M = np.transpose(b)
print(M)

[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.01111111 0.01111111 0.01111111
  0.01111111 0.         0.         0.         0.         0.
  0.         0.         0.01111111 0.         0.         0.
  0.         0.         0.         0.01111111 0.01111111 0.05263158
  0.00246914 0.01111111 0.         0.         0.         0.
  0.         0.         0.         0.         0.01111111 0.
  0.         0.01111111 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.01111111 0.         0.01111111 0.         0.
  0.         0.01111111 0.         0.01111111 0.         0.
  0.         0.         0.         0.01111111 0.02222222 0.01111111
  0.         0.         0.         0.01111111 0.01111111 0.01111111]
 [0.         0.         0.         0.         0.         0.
  0.   

In [None]:
r = np.full((len(M),1), 1/len(M))

def calPageRank(r, M, e):
  sum = [1, 0]
  rOld = r
  rNew = M.dot(rOld)

  while np.abs(sum[0] - sum[1]) > e:
    sum = [0, 0]
    
    rOld = rNew
    rNew = M.dot(rOld)
    for i in range(len(rOld)):
      sum[0] += rOld[i][0] - rNew[i][0]

    rOld = rNew
    rNew = M.dot(rOld)
    for i in range(len(rOld)):
      sum[1] += rOld[i][0] - rNew[i][0]
    
    print(sum[0], sum[1], np.abs(sum[0] - sum[1]))
  return rNew
res = calPageRank(r, M ,0.0001)

0.4042520917642749 0.16888604675119687 0.23536604501307803
0.12075193436563869 0.06802250553699359 0.052729428828645106
0.04221929808526001 0.025373383505209814 0.016845914580050195
0.015542437277289855 0.009472795880350654 0.006069641396939201
0.005798434819970415 0.0035486866916350167 0.0022497481283353985
0.0021746940744275375 0.0013332425230204695 0.000841451551407068
0.000817842676822825 0.0005018545728333307 0.0003159881039894943
0.0003080485608385063 0.00018912792348165942 0.00011892063735684686
0.00011613654632293676 7.132472876943734e-05 4.4811817553499415e-05


In [None]:
columns = StructType([StructField('PageRank', FloatType(), True)])

data = res.tolist()
dfTamp = sqlc.createDataFrame(data = data, schema = columns)
dfTamp = dfTamp.coalesce(1).withColumn("index", monotonically_increasing_id())
df = df2.coalesce(1).withColumn("index", monotonically_increasing_id())
df = df.join(dfTamp, df.index == dfTamp.index,"inner").select('Page', 'Out-degree', 'Dead-ends', 'PageRank')
df.show(100)

+--------------------+----------+---------+------------+
|                Page|Out-degree|Dead-ends|    PageRank|
+--------------------+----------+---------+------------+
|http://aaf.tdtu.e...|         1|        0| 5.513993E-7|
|http://aimas.tdtu...|         2|        0|4.9747047E-7|
|http://cait.tdtu....|        34|        0| 2.661274E-6|
|http://caodang.td...|         6|        0|8.4344714E-7|
|http://caodang.td...|         6|        0|8.4344714E-7|
|http://civil.tdtu...|         1|        0| 5.513993E-7|
|http://ecc.tdtu.e...|         2|        0| 6.367774E-6|
|http://enlabsafe....|         1|        0| 5.513993E-7|
|http://fba.tdtu.e...|         1|        0| 5.513993E-7|
|http://feee.tdtu....|         2|        0| 5.513993E-7|
|http://ffl.tdtu.e...|         8|        0|   6.9231E-7|
|http://finance.td...|         1|        0| 5.513993E-7|
|http://fms.tdtu.e...|        10|        0|2.7608637E-6|
|http://fss.tdtu.e...|       178|        0| 5.513993E-7|
|http://grad.tdtu....|         

In [None]:
!rm -rf PageRankData/
df.coalesce(1).write.csv(path='PageRankData', 
             header=True, 
             sep=',')