pip3 install scrapy
pip3 install bs4
pip3 install lxml
pip3 install pyspark
Create a new topic in Kafka:
bin/kafka-topics.sh --create --partitions 1 --replication-factor 1 --topic x_news_1 --bootstrap-server localhost:9092
cd to the consumer folder and run the Kafka consumer from the packaged jar:
java -jar target/consumer-V1-jar-with-dependencies.jar
cd to the crawler folder and run the command:
scrapy crawl news