/
scrapeController.go
143 lines (116 loc) · 3.84 KB
/
scrapeController.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
package controllers
import (
"context"
"fmt"
"regexp"
"strconv"
"strings"
"github.com/gin-gonic/gin"
"github.com/gocolly/colly"
"github.com/gocolly/colly/extensions"
"github.com/xilaluna/fentanyl-epidemic-tracker/configs"
"go.mongodb.org/mongo-driver/bson"
"go.mongodb.org/mongo-driver/mongo"
"go.mongodb.org/mongo-driver/mongo/options"
)
var websiteCollection *mongo.Collection = configs.DatabaseCollection(configs.GetClient(), "websites")
// ScrapeController crawls the darknetlive.com post listing, visits every
// article not already stored, and records each one in the articles
// collection with datapoint=true when any paragraph mentions "fentanyl"
// (case-insensitive). It replies with JSON {"message": "scraped"} when done.
func ScrapeController(c *gin.Context) {
	var (
		num       int  // number of listing pages to visit; may grow while page 1 is processed
		onPageOne bool // true only while the first listing page is being scraped
	)

	// Load the previously-seen pagination count; default to 1 on the first
	// run (no document yet) or when the stored field is missing/mistyped.
	numberPageFilter := bson.M{"website": "darknetlive.com"}
	var resultNumber bson.M
	if err := websiteCollection.FindOne(context.Background(), numberPageFilter).Decode(&resultNumber); err != nil {
		num = 1
	} else if stored, ok := resultNumber["paginationNum"].(int32); ok {
		num = int(stored)
	} else {
		// Document exists but "paginationNum" is absent or not an int32;
		// fall back instead of panicking on a failed type assertion.
		num = 1
	}

	// Compile once up front rather than on every pagination callback.
	pageNumRe := regexp.MustCompile("[0-9]+")

	// Listing-page collector with a randomized User-Agent.
	collector := colly.NewCollector(
		colly.AllowedDomains("darknetlive.com"),
		colly.CacheDir("./.cache"),
	)
	extensions.RandomUserAgent(collector)

	// Article-page collector, cloned so it shares the base configuration.
	articleCollector := collector.Clone()
	extensions.RandomUserAgent(articleCollector)

	// Find article links on listing pages and visit the ones not yet stored.
	collector.OnHTML("main > section > article > a", func(content *colly.HTMLElement) {
		link := content.Request.AbsoluteURL(content.Attr("href"))
		// Skip links already present in the database.
		var result bson.M
		filter := bson.M{"link": link}
		if err := articlesCollection.FindOne(context.Background(), filter).Decode(&result); err == nil {
			return
		}
		if err := articleCollector.Visit(link); err != nil {
			fmt.Println("Error visiting article:", link, err)
		}
	})

	// On page 1 only: read the last pagination link to learn the current
	// page count, persist it when it has grown, and extend the crawl.
	collector.OnHTML("main > nav > ul > li:last-child > a", func(content *colly.HTMLElement) {
		if !onPageOne {
			return
		}
		matches := pageNumRe.FindAllString(content.Attr("href"), -1)
		if len(matches) == 0 {
			// No digits in the href — nothing new to scrape; stop after page 1.
			onPageOne = false
			num = 1
			return
		}
		// Atoi error deliberately ignored: matches[0] is all digits by construction.
		lastPage, _ := strconv.Atoi(matches[0])
		if lastPage <= num {
			// Pagination count has not grown since the last run; stop after page 1.
			onPageOne = false
			num = 1
			return
		}
		// Upsert the new pagination count for the next run.
		update := bson.M{"$set": bson.M{"website": "darknetlive.com", "paginationNum": lastPage}}
		opts := options.Update().SetUpsert(true)
		if _, err := websiteCollection.UpdateOne(context.Background(), numberPageFilter, update, opts); err != nil {
			fmt.Println(err)
		}
		onPageOne = false
		num = lastPage
	})

	// Classify each article page and store exactly one document for it.
	articleCollector.OnHTML("main > article", func(content *colly.HTMLElement) {
		title := content.ChildText("header > h1")
		date := content.ChildText("aside > div > time")
		link := content.Request.URL.String()
		found := false
		// Scan paragraphs until the keyword is found, then stop early.
		content.ForEachWithBreak("div > p", func(i int, paragraph *colly.HTMLElement) bool {
			if strings.Contains(strings.ToLower(paragraph.Text), "fentanyl") {
				fmt.Println("Found article:", title, date, link)
				found = true
				return false // stop iterating paragraphs
			}
			return true
		})
		document := bson.D{
			{Key: "link", Value: link},
			{Key: "title", Value: title},
			{Key: "date", Value: date},
			{Key: "datapoint", Value: found},
		}
		if _, err := articlesCollection.InsertOne(context.Background(), document); err != nil {
			fmt.Println("Error inserting article:", link, err)
		}
	})

	articleCollector.OnRequest(func(request *colly.Request) {
		fmt.Println("Visiting", request.URL)
	})
	collector.OnRequest(func(request *colly.Request) {
		fmt.Println("Visiting", request.URL)
	})

	// Colly visits run synchronously, so the page-1 pagination callback may
	// raise num before i reaches 2, extending this loop to the new count.
	for i := 1; i <= num; i++ {
		var err error
		if i == 1 {
			onPageOne = true
			err = collector.Visit("https://darknetlive.com/post")
		} else {
			err = collector.Visit("https://darknetlive.com/post/page/" + strconv.Itoa(i))
		}
		if err != nil {
			fmt.Println(err)
		}
	}

	c.JSON(200, gin.H{
		"message": "scraped",
	})
}