In [1]:
import requests
from bs4 import BeautifulSoup
from urllib3.exceptions import ConnectTimeoutError,MaxRetryError,SSLError,ConnectionError,ProtocolError
from ssl import SSLCertVerificationError

def scrape_p(url, timeout=30):
  """Download a web page and return the text of its ``<p>`` elements.

  Args:
    url: Address of the page to fetch.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout ``requests.get`` may wait indefinitely on a stalled host.

  Returns:
    The text of every ``<p>`` tag joined with single spaces, with newlines
    replaced by spaces. An empty string is returned when the page cannot
    be fetched.
  """
  try:
    response = requests.get(url, timeout=timeout)
  except (requests.exceptions.RequestException, ConnectTimeoutError, MaxRetryError, ProtocolError):
    # requests re-raises transport failures as RequestException subclasses
    # (SSLError, ConnectionError, Timeout, MissingSchema, InvalidURL, ...),
    # so the base class is what makes this best-effort scrape reliable.
    # The urllib3 names are kept so nothing caught before is lost.
    return ''
  soup = BeautifulSoup(response.content, "html.parser")
  paragraph_texts = [p.get_text() for p in soup.find_all("p")]
  return ' '.join(paragraph_texts).replace('\n', ' ')

In [2]:
import chromadb
# Chroma client through which the topic collections in the next cell are created.
chroma_client = chromadb.Client()

In [3]:
# get_or_create_collection keeps this cell re-runnable: create_collection
# fails once a collection with the same name already exists on the client.
ml_collection = chroma_client.get_or_create_collection(name="machine_learning")
dl_collection = chroma_client.get_or_create_collection(name="deep_learning")
nlp_collection = chroma_client.get_or_create_collection(name="natural_language_processing")

In [4]:
# import pandas as pd
# ml_topics_set = list(pd.read_csv('dbpedia_machinelearning.csv')['nodeLabel'])
# dl_topics_set = list(pd.read_csv('dbpedia_deeplearning.csv')['nodeLabel'])
# nlp_topics_set = list(pd.read_csv('dbpedia_NLP.csv')['nodeLabel'])

In [5]:
def get_wiki_docs_collection(topics_set):
  """Scrape the English Wikipedia page of every topic.

  Each topic is converted to a string and its spaces are replaced by
  underscores to build the page URL.

  Returns:
    A list with one scraped text per topic, in the iteration order of
    ``topics_set``.
  """
  base_url = "https://en.wikipedia.org/wiki/"
  return [scrape_p(base_url + str(name).replace(' ','_')) for name in topics_set]

In [6]:
from pathlib import Path

def store_wiki_docs_collection(topics_set,folder_name):
  """Scrape the Wikipedia page of every topic and save it as a text file.

  Args:
    topics_set: Iterable of topic names; each is converted with ``str``.
    folder_name: Target folder, created (with parents) when missing.

  One UTF-8 file ``<topic>.txt`` is written per topic. A '/' in the topic
  is written as '_' in the file name.
  """
  Path(folder_name).mkdir(parents=True, exist_ok=True)
  base_url = "https://en.wikipedia.org/wiki/"
  for topic in topics_set:
    name = str(topic)
    text = scrape_p(base_url + name.replace(' ','_'))
    # The URL uses the original topic; only the file name has '/' replaced.
    target = folder_name+'/'+name.replace('/','_')+'.txt'
    with open(target,"w",encoding="utf-8") as out:
      out.write(text)

In [7]:
# store_wiki_docs_collection(ml_topics_set,'ml_wiki')

In [8]:
# store_wiki_docs_collection(dl_topics_set,'dl_wiki')

In [9]:
# store_wiki_docs_collection(nlp_topics_set,'nlp_wiki')

In [10]:
# base_topics_set = ['Machine learning','Deep learning','Natural language processing']

In [11]:
# store_wiki_docs_collection(base_topics_set,'base_wiki')

In [12]:
import re

def remove_reference_nums(text):
    """Return ``text`` without bracketed numeric markers such as ``[12]``."""
    citation_marker = re.compile(r"\[\d+\]")
    return citation_marker.sub('', text)

In [13]:
import os

# Folder name for the machine-learning documents; the same value is assigned
# again in a later cell together with the dl/nlp folders.
ml_dir = r'raw/ml_wiki'

def load_collection(collection,docs_directory):
    """Add every ``.txt`` document found directly in a folder to a collection.

    Args:
        collection: Object exposing ``add(documents=..., ids=...)``.
        docs_directory: Folder containing the UTF-8 ``.txt`` files. A missing
            folder results in nothing being added.

    Each file becomes one document. Its id is the file name without the
    ``.txt`` suffix and with '_' turned back into '/'. Numeric reference
    markers are stripped from the text before it is added.
    """
    # Take only the first os.walk entry (the folder itself). The previous loop
    # kept the file list of the *last* directory visited, so any sub-folder
    # replaced the real list with names that do not exist in docs_directory.
    _, _, file_names = next(os.walk(docs_directory), (docs_directory, [], []))

    for file_name in sorted(file_names):
        if not file_name.endswith('.txt'):
            continue
        # NOTE(review): this also rewrites underscores that were part of the
        # original topic name, not only those that stood in for '/'.
        doc_id = file_name.removesuffix('.txt').replace('_','/')
        with open('/'.join([docs_directory,file_name]),"r",encoding="utf-8") as handle:
            # read() rather than readline(): a stray line break in the stored
            # text (e.g. a lone '\r') must not truncate the document.
            doc = remove_reference_nums(handle.read())
        collection.add(documents=[doc],ids=[doc_id])

In [14]:
def _read_base_article(path):
    """Return the text stored at ``path`` with ``[n]`` reference markers removed."""
    with open(path,"r",encoding="utf-8") as handle:
        # read() rather than readline(): a stray line break in the stored text
        # must not cut the article short.
        return remove_reference_nums(handle.read())

# Query articles, one per collection.
ml_article = _read_base_article('raw/base_wiki/Machine learning.txt')
dl_article = _read_base_article('raw/base_wiki/Deep learning.txt')
nlp_article = _read_base_article('raw/base_wiki/Natural language processing.txt')

In [15]:
# Document folders, one per collection; they are the second argument of the
# load_collection calls below.
ml_dir = r'raw/ml_wiki'
dl_dir = r'raw/dl_wiki'
nlp_dir = r'raw/nlp_wiki'

# load_collection(ml_collection,ml_dir)
# load_collection(dl_collection,dl_dir)
# load_collection(nlp_collection,nlp_dir)

In [16]:
def store_relevant_topics(collection,query_article,num,file_name):
    """Query a collection with an article and save the returned ids.

    Args:
        collection: Object exposing ``query(query_texts=..., n_results=...)``
            whose result holds the matching ids under ``['ids'][0]``.
        query_article: Text used as the single query.
        num: Number of results requested.
        file_name: Output path without extension; ``.txt`` is appended.

    The ids are written one per line as UTF-8.
    """
    results = collection.query(
        query_texts = [query_article],
        n_results = num
    )
    target = Path(file_name+'.txt')
    # Create the output folder first (as store_wiki_docs_collection does), so a
    # missing folder such as 'topics/ml' does not fail after the query has run.
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target,"w",encoding="utf-8") as file:
        file.write('\n'.join(results['ids'][0]))

In [17]:
# Result counts iterated by the loop below (passed as `num` to store_relevant_topics).
topic_nums = [5,10,15,25,50,100,250,500,750,1000,1250,1500,1750,2000,2500]

# for num in topic_nums:
#     store_relevant_topics(ml_collection,ml_article,num,'topics/ml/ml_'+str(num))


In [18]:
# Same result counts as in the previous cell, re-assigned for the deep-learning run.
topic_nums = [5,10,15,25,50,100,250,500,750,1000,1250,1500,1750,2000,2500]

# for num in topic_nums:
#     store_relevant_topics(dl_collection,dl_article,num,'topics/dl/dl_'+str(num))

In [19]:
# Same result counts as in the previous cells, re-assigned for the NLP run.
topic_nums = [5,10,15,25,50,100,250,500,750,1000,1250,1500,1750,2000,2500]

# for num in topic_nums:
#     store_relevant_topics(nlp_collection,nlp_article,num,'topics/nlp/nlp_'+str(num))

In [20]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import json

def get_article_links(topics_file_path,output_file_path,number_of_articles):
    """Collect Startpage result links for every topic listed in a file.

    Args:
        topics_file_path: UTF-8 text file with one search topic per line.
        output_file_path: Folder that receives the JSON file; created if missing.
        number_of_articles: Maximum number of links kept per topic.

    Links whose address contains 'wikipedia' or 'youtube' are dropped. One
    JSON object mapping each topic to its list of links is written.

    NOTE(review): as before, the single output file is named after the last
    topic in the list; confirm whether one file per topic was intended.
    """
    from urllib.parse import quote

    with open(topics_file_path,"r",encoding="utf-8") as file:
        topics = [line.replace('\n','') for line in file]
    if not topics:
        # No topic means no file name to derive; previously this path ended in
        # a NameError after the browser had already been started.
        return

    # Make sure the target exists before the long scrape, not after it.
    os.makedirs(output_file_path, exist_ok=True)

    excluded = ('wikipedia','youtube')
    linkdict = dict()
    driver = webdriver.Chrome()
    try:
        for query in topics:
            # Escape the topic so characters such as '&', '#' or '+' stay part
            # of the search term instead of altering the URL.
            driver.get(f'https://www.startpage.com/search?q={quote(query, safe="")}')
            links = []
            for link in driver.find_elements(By.CLASS_NAME,'result-link'):
                href = link.get_attribute('href')
                # get_attribute returns None when the attribute is absent.
                if not href or any(substring in href for substring in excluded):
                    continue
                links.append(href)
            linkdict[query] = links[:number_of_articles]
    finally:
        # Always close the browser, even when a page load fails.
        driver.quit()

    # '/' in a topic would otherwise be read as a sub-folder.
    file_stem = query.replace('/','_')
    with open(output_file_path+'/'+file_stem+'.json','w',encoding='utf-8') as file:
        file.write(json.dumps(linkdict,indent=4))

In [21]:
# Topic list files, one per collection; ml_topics_file is the input of the
# commented get_article_links call below.
ml_topics_file = r'topics/ml/ml_2500.txt'
dl_topics_file = r'topics/dl/dl_2500.txt'
nlp_topics_file = r'topics/nlp/nlp_2500.txt'

# get_article_links(ml_topics_file,'aug3/ml',3)

In [22]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import json
import time
import random

def get_article_links_gibiru(topics_file_path,output_file_path,number_of_articles):
    """Collect Gibiru result links for every topic listed in a file.

    Args:
        topics_file_path: UTF-8 text file with one search topic per line.
        output_file_path: Folder that receives the JSON files; created if missing.
        number_of_articles: Maximum number of links stored per topic.

    For each topic a file ``<topic>.json`` ('/' written as '_') holding a JSON
    list of unique links is written. Links containing 'wikipedia' or 'youtube'
    are dropped. A line ``<topic> : True/False`` is printed per topic, telling
    whether any link was found.
    """
    from urllib.parse import quote

    with open(topics_file_path,"r",encoding="utf-8") as file:
        topics = [line.replace('\n','') for line in file]

    os.makedirs(output_file_path, exist_ok=True)
    excluded = ('wikipedia','youtube')

    driver = webdriver.Chrome()
    try:
        for query in topics:
            # Escape the topic so characters such as '&', '#' or '+' stay part
            # of the search term instead of altering the URL.
            url = f'https://www.gibiru.com/results.html?q={quote(query, safe="")}'
            # Random pause between searches.
            time.sleep(random.randint(10,15))
            driver.get(url)
            links = []
            for link in driver.find_elements(By.CSS_SELECTOR,'a.gs-title'):
                target = link.get_attribute('data-ctorig')
                if not target:
                    # Anchors without the attribute used to be stored as None,
                    # which ended up as null in the JSON and counted as a hit.
                    continue
                if any(substring in target for substring in excluded):
                    continue
                if target not in links:
                    links.append(target)

            mod_query = query.replace('/','_')
            with open(output_file_path+'/'+mod_query+'.json','w',encoding='utf-8') as file:
                file.write(json.dumps(links[:number_of_articles],indent=4))
            print(mod_query,':',len(links)>0)
    finally:
        # Always close the browser, even when a page load fails.
        driver.quit()
    

In [23]:
# get_article_links_gibiru('topics/dl/dl_5.txt','topics',5)

In [24]:
# Topic list files (same values as assigned in an earlier cell); inputs of the
# commented get_article_links_gibiru calls below.
ml_topics_file = r'topics/ml/ml_2500.txt'
dl_topics_file = r'topics/dl/dl_2500.txt'
nlp_topics_file = r'topics/nlp/nlp_2500.txt'

# get_article_links_gibiru(ml_topics_file,'aug5/ml',5)
# get_article_links_gibiru(dl_topics_file,'aug5/dl',5)

In [39]:
dl_report2 = """
Convolutional neural network : True
Artificial neural networks : True
Contrastive divergence : True
Restricted Boltzmann machine : True
Kunihiko Fukushima : True
Text-to-image generation : True
Neuromorphic computing : False
Training set : True
Image classification : True
Comparison of deep learning software : True
Comparison of deep-learning software : True
Artificial intelligence : True
LSTM : True
Echo state network : False
CLARION (cognitive architecture) : True
WaveNet : True
Cognitive modelling : True
Connectomics : True
Group method of data handling : True
Russ Salakhutdinov : True
Medical algorithm : True
Alex Krizhevsky : True
Computer science : True
Computer sciences : True
ImageNet competition : True
ImageNet Large Scale Visual Recognition Challenge : True
ImageNet : True
Network architecture : True
Adaptive resonance theory : True
Boltzmann machine : True
LIDA (cognitive architecture) : True
Christopher Bishop : True
Memory : True
Synapse : True
Word-sense disambiguation : True
Decision tree learning : True
Gini impurity : True
Max Planck Institute for Biological Cybernetics : True
Biological cybernetics : True
Multiclass classification : True
CPU : True
Mental exercise : True
Computer network : True
Relationship extraction : True
Computer memory : True
Novelty detection : True
Generative model : True
Receptive field : True
Linear classifier : True
Natural language : True
Short-term memory : True
Face Recognition Grand Challenge : True
Field-programmable gate array : True
Cyborgs : True
OpenAI Five : True
Photo restoration : True
Support vector machine : True
Support-vector machine : True
Seq2seq : True
Computer engineering : True
Backpropagation through time : True
Digital circuit : True
Aging brain : True
Offline learning : True
Rprop : True
Mathematical structures : True
Binary classification : True
Primary auditory cortex : True
Image registration : True
Description logic : True
Brain development : True
FERET (facial recognition technology) : True
BCM theory : True
Edge detection : True
Parallel distributed processing : True
Speeded up robust features : True
Quantum computing : True
Optimization : True
Decision list : True
Multi-agent reinforcement learning : True
Hardware accelerator : True
Algorithms : True
Naïve algorithm : True
Terry Sejnowski : True
Microwork : True
Myelin sheath : True
Scale space : True
Health Level 7 : True
Image restoration : True
Dual space : True
Differentiable programming : True
ICD-11 : True
Web Ontology Language : False
Stochastic context-free grammar : True
Scale-invariant feature transform : True
Backdoor (computing) : True
Andrew Ng : True
Neurotrophic factor : True
Semantic web : True
Computational complexity theory : True
Floating-gate : True
Neural backpropagation : True
Language : True
Attention : True
ID3 algorithm : True
Scale-space segmentation : True
AI-complete : True
Automated reasoning : True
Ridge detection : True
Medications : True
Field extension : True
Limbic : True
Inferential programming : True
Generalization : True
Automation : True
Ben Goertzel : True
Text mining : True
Algorithmic paradigm : True
Neurodevelopmental disorder : True
Nonlinear system identification : True
Difference of Gaussians : True
Function of several real variables : True
Visual cortex : True
Algorithmic bias : True
Knowledge extraction : True
Service delivery platform : True
Overfitting : True
Cellular network : True
Gene Ontology : True
Conductive trace : True
Printed circuit board : True
Douglas Lenat : True
Intelligent personal assistant : True
Digital data : True
Cognition : True
Cognitive : True
Divide-and-conquer algorithm : True
Category (mathematics) : True
MIT Computer Science and Artificial Intelligence Laboratory : True
Guillermo Sapiro : True
Symbolic artificial intelligence : True
Mipmap : True
Overhead Imagery Research Data Set : True
Semiconductor : True
Semiconductors : True
Smoothing : True
Electroencephalography : True
Database : True
Simulated annealing : True
Analog-to-digital converter : True
Statistical machine translation : True
Cybernetics : True
Evolutionary algorithm : True
Semantic role labeling : True
Information and communications technology : True
Big data : True
Quantum algorithm : True
Digital communications : True
Vector space : True
Vector spaces : True
Mycin : True
Inverse problems : True
Question answering : True
Multiscale mathematics : True
Cognitive reserve : True
Binary Space Partition : True
Richard Mattson : True
Finite-state machine : True
Propositional logic : True
Texture synthesis : True
Knowledge base : True
Quick, Draw! : True
Confusion matrix : True
John E. Laird : True
Natural-language user interface : True
Conservation and restoration of photographs : False
Partial function : True
Formal grammar : True
Electronic component : False
Xeon : True
Deterministic algorithm : True
Homeland security : True
Boltzmann distribution : True
Natural language parsing : True
Recursion : True
Eigenstate : True
Recursion (computer science) : True
Recursive algorithm : True
Computer optimization : True
Bayesian inference in motor learning : True
Microsoft Research : True
American Association of Artificial Intelligence : True
Autonomous driving : True
Text simplification : True
Biomolecular target : True
Freebase (database) : True
Genome biology : True
Physician : True
Drug candidate : True
Edward W. Veitch : True
3Blue1Brown : True
Software patent : True
Heuristic : True
Truecasing : True
Shenlan SL03 : True
Covector : True
Wireless gateway : True
Probability theory : True
Search algorithm : True
TD-Gammon : True
Information system : True
Infrared cleaning : False
Solomonoff's theory of inductive inference : True
Single instruction, multiple threads : True
Macrophage : True
Institute for Computational Engineering and Sciences : True
Moore's law : True
Landline : True
Bayesian hierarchical modeling : True
Hebbian learning : True
CRM114 (program) : True
Face detection : True
Arithmetic : True
Resistor : True
Evolutionary programming : True
Monocyte : True
Set-top box : True
Lewy body disease : True
Wavefunction : True
Donald Knuth : True
SHRDLU : False
Proposition (logic) : True
Propositions : True
Marcus Hutter : False
Propositional function : False
National Science Foundation : False
Optimization problem : False
Combinatorial : False
String algorithms : False
Neuroplastic effects of pollution : False
TinEye : False
Image reconstruction : False
Computer hardware : False
Lebesgue integral : False
Lebesgue integration : False
Subjective logic : True
Nonmonotonic logic : True
Named entity : True
Contour line : True
CONTSYS : True
Force : True
Computational geometry : True
Physics : True
Regression analysis : True
Analog circuit : True
Analog electronics : False
Cleverbot : True
Tara Spires-Jones : True
Electrical engineering : True
Seam carving : True
Introduction to Algorithms : True
Data modeling : True
Tokenization (lexical analysis) : False
Cognitive psychology : True
Latent variable : False
Latent variables : True
Cyc : True
Frame semantics (linguistics) : True
Libratus : True
Alzheimer's disease : True
Linear programming : True
Huffman coding : True
Amazon Mechanical Turk : True
Customer value maximization : True
Li Zhaoping : True
Technological change : True
Conceptual metaphor : True
Chart parsing : True
animal : False
Donald Hebb : True
Signal processing : False
Mathematical statistics : True
Backus-Naur form : True
Psychosis : True
Conservator-restorer : True
WordNet : True
Federal Bureau of Investigation : True
System integration : True
Engineering : True
Cookbook : True
Markov chain Monte Carlo : True
Parietal lobe : True
Time series prediction : True
Time series : True
Universal (metaphysics) : True
Medicine : False
Executive functions : True
Reason : False
Reasoning : True
Transducer : True
Michael Collins (computational linguist) : True
Board game : True
Acetylcholine : True
Semiotics : True
protein : True
Proteins : True
Algorithmic topology : True
Compound term processing : True
Stack (data structure) : True
Backward chaining : True
Differential manifold : True
Execution (computing) : False
Clinical and Translational Science Award : True
Affine deformation : True
Sobolev space : True
Coarticulation : True
Propositional formula : True
State diagram : True
General Atomics MQ-9 Reaper : True
Symbolic language (programming) : True
Closed-form expression : True
Nursing home : True
FORR : True
National Academy of Engineering : True
Flowchart : True
Distributed computing : True
National Electrical Manufacturers Association : True
Data center : True
Hyperparameter : True
Massage : False
Monte Carlo method : True
Monte Carlo methods : True
Affine shape adaptation : True
Algorithmic synthesis : True
John Brennan (CIA officer) : True
NMDA receptor : True
Diode : True
Dimension : True
Transduction (machine learning) : True
Emergent behavior : True
Karnaugh map : True
Veitch diagram : True
Numerical analysis : True
Dementia with Lewy bodies : True
Algorithm analysis : True
Analysis of algorithms : True
Static random-access memory : True
Government by algorithm : False
Theano (software) : True
Relay : True
Function spaces : True
Inner product : True
Masking (Electronic Health Record) : False
Home care : True
Kleene : True
Non-deterministic algorithm : True
Dialect : True
ASC X12 : True
Inferior colliculus : True
Biomarkers of aging : True
Aerobic exercise : True
Text corpus : True
Impredicative definition : True
Base station subsystem : True
Computational mathematics : True
Amyloid plaques : True
Breadboard : True
Principal Component Analysis : True
Security alarm : True
Argument : True
Rule of inference : True
Huawei Mate 20 : True
BT 21CN : True
Blind signal separation : True
Health care : True
Bayesian search theory : True
AIXI : True
Realtek : True
Deductive reasoning : True
Wikidata : True
RFM (customer value) : True
Acrylic paint : False
Validity (logic) : False
Wireless : True
ArchNet : True
Real coordinate space : True
Binary function : True
Corner detection : True
Bayesian probability : True
Deductivism : True
Hypothetico-deductive method : True
Hypothetico-deductive model : True
Probabilistic logic : True
White House Chief of Staff : True
Computer and network surveillance : True
NIH : True
Validation therapy : True
Histopathology : True
Robustness (computer science) : True
Decision support system : True
Gliosis : True
United States Secretary of State : True
Matrix notation : True
Automatic translation : True
Conservation and restoration of cultural heritage : True
Corticobasal degeneration : True
Entropy maximization : True
Dietrich Dörner : True
Tag (Facebook) : True
Emmanuel Macron : True
Massachusetts Institute of Technology : True
MIT : True
Method of analytic tableaux : True
Module (mathematics) : True
Petal Search : True
Data privacy : True
Royal Military Academy (Belgium) : True
Lambda calculus : True
Presidency of Joe Biden : True
Peptide hormones : True
Ian Hacking : True
Formulas : True
Activity tracker : True
European Union : True
John Robert Anderson (psychologist) : True
Apoptosis : True
Association for Computing Machinery : True
Surjective : True
Entrepreneurship : True
Conjunctive normal form : True
television station : True
National Intelligence Law of the People's Republic of China : True
Iteration : False
P-vector : True
Posterior distribution : True
Endomorphism ring : True
Baidu : True
Metric space : True
Differentiable function : True
Marginal likelihood : True
John Venn : True
Overlapping subproblems : True
Square matrix : True
Eli Wallach : True
U.S. Army Research Laboratory : True
Sophia Antipolis : True
Parkinsonism : True
Denying a conjunct : True
HongMeng OS : True
Logical reasoning : True
CREB : True
Tangent space : True
Software patent debate : True
Decision theory : True
Minterms : True
Carnegie Classification of Institutions of Higher Education : True
Statistical : True
European Commission : True
Laplacian : True
Unsaturated fatty acid : True
False positives and false negatives : True
Multiply–accumulate operation : True
Section (fiber bundle) : True
Cognitive behavioral therapy : True
Scale (ratio) : True
Ontology (information science) : True
Potentiometer : True
Prim's algorithm : True
Dirichlet boundary condition : True
Factor Graphs : True
Coreference : True
Fault tree analysis : True
Integral : True
Orientation (vector space) : True
Toxicity : True
Passive optical network : False
HiSilicon : True
High blood pressure : True
Hypertension : True
Cerebral atherosclerosis : True
Formula (mathematical logic) : True
Well formed formula : True
Well-formed formula : True
Air pollution : True
Cannabinoid : True
Export Administration Regulations : True
Management science : True
William H. McRaven : True
Complex numbers : True
Closure (mathematics) : True
Malware : False
Contribution margin : False
Solder : False
Faroe Islands : False
Freshman Research Initiative : False
Logical quantifier : False
U.S. Senate : False
Local extrema : False
Local optimum : False
Minima : False
Recipe : False
University of California, Irvine : False
Point (geometry) : False
Emil Kraepelin : False
Xbox : False
Cash flows : False
Multiple sclerosis : False
Coordinates : False
Diffusion equation : False
Approximation algorithm : False
Digital video : False
Bernhard Bolzano : False
Locally : False
Implicit function theorem : False
HIV_AIDS : False
Orthogonal basis : False
Ordered pair : False
Integer programming : False
Chinese language : False
Relevance : False
Neta S : False
Universal property : False
Structured program theorem : False
Halite AI Programming Competition : False
Stock market prediction : False
Vital signs : False
PI3K_AKT_mTOR pathway : False
Binomial distribution : False
Row and column vectors : False
Commutative law : False
Commutative : False
Commutativity : False
NCAA Division I FBS : False
Constraint programming : False
Axiom system : False
Postdoctoral research : False
Memantine : False
Allen Newell : False
Mean : False
Cognitive linguistics : False
Babylonian astronomy : False
Olive oil : False
Beta cells : False
Electricity : False
Paul C. Rosenbloom : False
Belief : False
Comma-separated values : False
Sentence breaking : False
ISO TC 215 : False
Calcium in biology : False
Gothic Revival architecture : False
Radiotracer : False
Google Fuchsia : False
Clock : False
Timelike : False
Turkish language : False
American English : False
A. K. Dewdney : False"""

In [None]:
# with open('aug5/dl_report2.csv','w',encoding='utf-8') as file:
#     file.write(dl_report2.replace(':',','))

In [None]:
# import fileinput
# import sys

# for line in fileinput.input('aug5/dl_report2.csv', inplace=True):
#     sys.stdout.write('"{l}'.format(l=line))

In [42]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import json
import time
import random

def get_missing_article_links_gibiru(missing_topics_list,output_file_path,number_of_articles):
    """Collect Gibiru result links for an explicit list of topics.

    Args:
        missing_topics_list: Iterable of topic strings to search for.
        output_file_path: Folder that receives the JSON files; created if missing.
        number_of_articles: Maximum number of links stored per topic.

    For each topic a file ``<topic>.json`` ('/' written as '_') holding a JSON
    list of unique links is written. Links containing 'wikipedia' or 'youtube'
    are dropped. A line ``<topic> : True/False`` is printed per topic, telling
    whether any link was found.
    """
    from urllib.parse import quote

    os.makedirs(output_file_path, exist_ok=True)
    excluded = ('wikipedia','youtube')

    driver = webdriver.Chrome()
    try:
        for query in missing_topics_list:
            # Escape the topic so characters such as '&', '#' or '+' stay part
            # of the search term instead of altering the URL.
            url = f'https://www.gibiru.com/results.html?q={quote(query, safe="")}'
            # Random pause between searches.
            time.sleep(random.randint(10,15))
            driver.get(url)
            links = []
            for link in driver.find_elements(By.CSS_SELECTOR,'a.gs-title'):
                target = link.get_attribute('data-ctorig')
                if not target:
                    # Anchors without the attribute used to be stored as None,
                    # which ended up as null in the JSON and counted as a hit.
                    continue
                if any(substring in target for substring in excluded):
                    continue
                if target not in links:
                    links.append(target)

            mod_query = query.replace('/','_')
            with open(output_file_path+'/'+mod_query+'.json','w',encoding='utf-8') as file:
                file.write(json.dumps(links[:number_of_articles],indent=4))
            print(mod_query,':',len(links)>0)
    finally:
        # Always close the browser, even when a page load fails.
        driver.quit()
    
    

In [43]:
import pandas as pd
# Topics whose 'Successful' column is False in the saved report.
df1 = pd.read_csv('aug5/dl_report2.csv')
dl_missing1 = list(df1[df1['Successful']==False]['Topic'])

# NOTE: unlike the other scraping calls in this notebook, this one is not
# commented out, so running the cell opens the browser and repeats the search.
get_missing_article_links_gibiru(dl_missing1,'aug5/dl',5)

Neuromorphic computing : True
Echo state network : False
Web Ontology Language : False
Conservation and restoration of photographs : True
Electronic component : True
Infrared cleaning : True
SHRDLU : True
Marcus Hutter : True
Propositional function : True
National Science Foundation : True
Optimization problem : True
Combinatorial : True
String algorithms : True
Neuroplastic effects of pollution : True
TinEye : True
Image reconstruction : True
Computer hardware : True
Lebesgue integral : True
Lebesgue integration : True
Analog electronics : True
Tokenization (lexical analysis) : True
Latent variable : True
animal : True
Signal processing : True
Medicine : True
Reason : True
Execution (computing) : True
Massage : True
Government by algorithm : True
Masking (Electronic Health Record) : True
Acrylic paint : True
Validity (logic) : True
Iteration : True
Passive optical network : True
Malware : True
Contribution margin : True
Solder : True
Faroe Islands : True
Freshman Research Initiative :

In [29]:
import pandas as pd
# Topics whose 'Successful' column is False in the first ML report.
df = pd.read_csv('aug5/ml_report1.csv')
ml_missing1 = list(df[df['Successful']==False]['Topic'])
# The topic list is written as UTF-8 by store_relevant_topics, so read it back
# with the same encoding; relying on the platform default can garble non-ASCII
# topic names and report them as missing.
with open('topics/ml/ml_2500.txt','r',encoding='utf-8') as file:
    l = [s.replace('\n','') for s in file]
known_topics = set(l)
# Reported topics that do not appear verbatim in the topic list.
messed_up = [missing for missing in ml_missing1 if missing not in known_topics]
# print(messed_up)
# print(l[1930])

# get_missing_article_links_gibiru(ml_missing1,'aug5/ml',5)

In [30]:
# with open('aug5/ml_report2.csv','w',encoding='utf-8') as file:
#     file.write(ml_report2.replace(':',','))

In [31]:
# import fileinput
# import sys

# for line in fileinput.input('aug5/ml_report2.csv', inplace=True):
#     sys.stdout.write('"{l}'.format(l=line))

In [32]:
import pandas as pd
# Topics whose 'Successful' column is False in the second ML report.
df2 = pd.read_csv('aug5/ml_report2.csv')
ml_missing2 = list(df2[df2['Successful']==False]['Topic'])

# get_missing_article_links_gibiru(ml_missing2,'aug5/ml',5)

In [33]:
# with open('aug5/ml_report3.csv','w',encoding='utf-8') as file:
#     file.write(ml_report3.replace(':',','))

In [34]:
# import fileinput
# import sys

# for line in fileinput.input('aug5/ml_report3.csv', inplace=True):
#     sys.stdout.write('"{l}'.format(l=line))

In [35]:
import pandas as pd
# Topics whose 'Successful' column is False in the third ML report.
df3 = pd.read_csv('aug5/ml_report3.csv')
ml_missing3 = list(df3[df3['Successful']==False]['Topic'])

# get_missing_article_links_gibiru(ml_missing3,'aug5/ml',5)

In [None]:
import os
import json

def get_articles(docs_directory,output_directory):
    """Scrape every link listed in a folder of JSON files and save the texts.

    Args:
        docs_directory: Folder with ``<topic>.json`` files, each holding a JSON
            list of links. A missing folder results in nothing being scraped.
        output_directory: Folder for the scraped texts; created if missing.

    The text of link number ``i`` of a topic is written as UTF-8 to
    ``<topic> <i>.txt`` and a confirmation line is printed for it.
    """
    # Take only the first os.walk entry (the folder itself). The previous loop
    # kept the file list of the *last* directory visited, so any sub-folder
    # replaced the real list with names that do not exist in docs_directory.
    _, _, file_names = next(os.walk(docs_directory), (docs_directory, [], []))
    os.makedirs(output_directory, exist_ok=True)

    for doc in sorted(file_names):
        if not doc.endswith('.json'):
            continue
        with open(docs_directory+'/'+doc,'r',encoding='utf-8') as file:
            links = json.load(file)

        topic = doc.removesuffix('.json')
        for i,link in enumerate(links):
            if not link:
                # A null or empty entry is not a URL; passing it to scrape_p
                # would abort the whole run. The index is kept so the other
                # file names stay the same.
                continue
            article = scrape_p(link)
            fname = topic+ ' '+str(i)
            # article = f'[Topic:{doc.removesuffix('.json')}, Link:{link}]'+article

            with open(output_directory+'/'+fname+'.txt','w',encoding='utf-8') as file:
                file.write(article)
            # NOTE(review): 4 presumably marks the last of five links per topic
            # - confirm against the number_of_articles used when collecting.
            if i < 4:
                print(fname + ' saved 👆')
            else:
                print(fname+' saved 👌')

In [37]:
# get_articles('aug5/ml','aug5_docs/ml')