<a href="https://colab.research.google.com/github/zuba1rkhan/NLP/blob/main/NLP_Medical_%26_Non_Medical__Articals_in_Wikipedia_API.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install wikipedia
!pip install requests

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11679 sha256=83371e66d1152419f9ae48471c97c714ef2742254078b5885e0318677cfd89af
  Stored in directory: /root/.cache/pip/wheels/5e/b6/c5/93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [3]:
import wikipedia
import nltk
from nltk import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import PorterStemmer as stemmer
from nltk import FreqDist
from nltk.classify import apply_features
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import re

In [4]:
def remove_sp_char(text):
  remove_text = re.sub(r'[^a-zA-Z\s]', '', text)
  return remove_text

In [5]:
def convert_low_case(text):
  low_case_text = text.lower()
  return low_case_text

In [6]:
def stem_tokenize_sp_case(text):
  tokens_word = word_tokenize(text)
  stem_text = ' '.join([stemmer.stem(word)for word in tokens_word])
  return stem_text


In [7]:
def remove_stop_words(text):
  token_word = word_tokenize(text)
  stop_text = set(stopwords.words('english'))
  cleaned_text = ' '.join([word for word in token_word if word.lower() not in stop_text])
  return cleaned_text

In [8]:
def remove_html_text(text):
    soup = BeautifulSoup(text, 'html.parser')
    cleaned_text = soup.get_text()
    return cleaned_text

In [9]:
def lemmatize_text(text):
  lemmatizer = WordNetLemmatizer()
  token_text = nltk.word_tokenize(text)
  lemmatized_text = ' '.join(lemmatizer.lemmatize(word) for word in token_text)
  return lemmatized_text

In [10]:
def stem_text(text):
  stemmed = SnowballStemmer('english')
  token_text = nltk.word_tokenize(text)
  stemmed_text = ' '.join(stemmed.stem(word) for word in token_text)
  return stemmed_text

In [11]:
def preprocess_text(text):
  filtered_text = remove_sp_char(text)
  filtered_text = convert_low_case(text)
  #filtered_text = stem_tokenize_sp_case(text)
  filtered_text = lemmatize_text(text)
  filtered_text = stem_text(text)
  filtered_text = remove_stop_words(text)
  filtered_text = remove_html_text(text)
  return filtered_text

In [12]:
import requests

def get_wikipedia_articles_by_category(category, language = 'en', num_results=10000):
  base_url = 'https://en.wikipedia.org/w/api.php'.format(language)
  parameters = {
        'action': 'query',
        'format': 'json',
        'list': 'categorymembers',
        'cmtitle': 'Category:' + category,
        'cmlimit': num_results
  }
  response = requests.get(base_url, parameters)
  data = response.json()

  if 'query' in data and 'categorymembers' in data['query']:
    articles = [entry['title'] for entry in data['query']['categorymembers']]
    return articles
  else:
    return None

In [14]:
medical = ['medicine', 'medicine', 'doctor', 'nurse']
non_medical = ['history', 'Engineering', 'computer science', 'geography']
medical_articles = []
non_medical_articles = []
for i in range(len(medical)):
  medical_category = get_wikipedia_articles_by_category(medical[i],)
  medical_articles.append(medical_category)
medical_articles = [article for articles in medical_articles for article in articles]
print("Medical articles: ", medical_articles)
for j in range(len(non_medical)):
  non_medical_category = get_wikipedia_articles_by_category(non_medical[j],)
  non_medical_articles.append(non_medical_category)

non_medical_articles = [article for articles in non_medical_articles for article in articles]

print('\nNon Medical Articles: ', non_medical_articles)

labeled_data = [
    {'text': article, 'label': 1} for article in medical_articles
] + [
    {'text': article, 'label': 0} for article in non_medical_articles
]
import pandas as pd
article_df = pd.DataFrame(labeled_data)

Medical articles:  ['Medicine', 'Outline of medicine', 'Portal:Medicine', 'Category:Medicine by city', 'Category:Medicine by country', 'Category:Medicine by country subdivision', 'Category:Medicine by dependent territory', 'Category:Medicine by century', 'Category:Medicine by decade', 'Category:Medicine by year', 'Category:Medical lists', 'Category:Alternative medicine', 'Category:Veterinary medicine', 'Category:Clinical medicine', 'Category:Medical activism', 'Category:Medical associations', 'Category:Cause (medicine)', 'Category:Chemicals in medicine', 'Category:Medical-related conspiracy theories', 'Category:Medical culture', 'Category:Medicine deities', 'Category:Medical education', 'Category:History of medicine', 'Category:Health insurance', 'Category:Intersex and medicine', 'Category:Intersex variations', 'Category:Medical aspects of death', 'Category:Medical diplomacy', 'Category:Medical families', 'Category:Medical humanities', 'Category:Medical monitoring', 'Category:Medical p

In [15]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
filtered_text = [preprocess_text(doc) for doc in article_df['text']]

vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(filtered_text)

print("Vocabulary:", vectorizer.get_feature_names_out())
print("BoW Matrix:\n", bow_matrix.toarray())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Vocabulary: ['11' 'about' 'activism' 'aerospike' 'aging' 'agnostic' 'algebra'
 'alternative' 'and' 'area' 'areas' 'aspects' 'asset' 'assisted'
 'associations' 'attacks' 'awards' 'bajrangarh' 'bifiltration' 'biography'
 'boolean' 'bug' 'by' 'care' 'category' 'catering' 'cause' 'century'
 'charenton' 'charmant' 'chemical' 'chemicals' 'choice' 'chronology'
 'citizenship' 'city' 'civilization' 'classifications' 'clinical'
 'cluster' 'code' 'competitions' 'compressed' 'computation'
 'computational' 'computer' 'concepts' 'conferences' 'conspiracy'
 'construction' 'consulting' 'contact' 'contents' 'contiguity'
 'controversies' 'coordinates' 'counter' 'countries' 'country' 'cover'
 'critical' 'culture' 'curve' 'data' 'date' 'day' 'death' 'decade' 'decay'
 'deities' 'density' 'dependent' 'deserts' 'design' 'detector' 'developer'
 'developing' 'development' 'diplomacy' 'disciplines' 'distance'
 'distribution' 'division' 'dynasty' 'earth' 'easting' 'economic'
 'economy' 'edgelands' 'education' 'e

  soup = BeautifulSoup(text, 'html.parser')


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

labels = article_df['label']

X_train, X_test, y_train, y_test = train_test_split(bow_matrix, labels, test_size = 0.2, random_state = 37)

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

nb_predict = nb_classifier.predict(X_test)

lr_classifier = LogisticRegression()
lr_classifier.fit(X_train, y_train)

lr_predict = lr_classifier.predict(X_test)

print('Naive Bayes Accuracy: ', accuracy_score(y_test, nb_predict))
print('Naive Bayes Classification:\n', classification_report(y_test, nb_predict))

print('\nLogistic Regression Accuracy:', accuracy_score(y_test, lr_predict))
print('Logistion Regression Classification:\n', classification_report(y_test, lr_predict))

Naive Bayes Accuracy:  0.9384615384615385
Naive Bayes Classification:
               precision    recall  f1-score   support

           0       1.00      0.92      0.96        51
           1       0.78      1.00      0.88        14

    accuracy                           0.94        65
   macro avg       0.89      0.96      0.92        65
weighted avg       0.95      0.94      0.94        65


Logistic Regression Accuracy: 0.9846153846153847
Logistion Regression Classification:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99        51
           1       1.00      0.93      0.96        14

    accuracy                           0.98        65
   macro avg       0.99      0.96      0.98        65
weighted avg       0.98      0.98      0.98        65



In [17]:
print("Actual Labels:", y_test)
print(bow_matrix)
print("Naive Bayes Predictions:", nb_predict)
print("Logistic Regression Predictions:", lr_predict)

Actual Labels: 185    0
286    0
229    0
131    0
218    0
      ..
198    0
32     1
130    0
259    0
275    0
Name: label, Length: 65, dtype: int64
  (0, 185)	1
  (1, 185)	1
  (1, 215)	1
  (1, 208)	1
  (2, 185)	1
  (2, 230)	1
  (3, 185)	1
  (3, 24)	1
  (3, 22)	1
  (3, 35)	1
  (4, 185)	1
  (4, 24)	1
  (4, 22)	1
  (4, 58)	1
  (5, 185)	1
  (5, 24)	1
  (5, 22)	1
  (5, 58)	1
  (5, 285)	1
  (6, 185)	1
  (6, 24)	1
  (6, 22)	1
  (6, 71)	1
  (6, 299)	1
  (7, 185)	1
  :	:
  (315, 123)	1
  (316, 24)	1
  (316, 305)	1
  (316, 330)	1
  (317, 208)	1
  (317, 24)	1
  (317, 122)	1
  (317, 312)	1
  (317, 82)	1
  (318, 24)	1
  (318, 327)	1
  (318, 1)	1
  (318, 123)	1
  (319, 24)	1
  (319, 328)	1
  (319, 182)	1
  (320, 24)	1
  (320, 122)	1
  (320, 330)	1
  (321, 24)	1
  (321, 147)	1
  (321, 121)	1
  (322, 24)	1
  (322, 283)	1
  (322, 123)	1
Naive Bayes Predictions: [0 0 0 0 0 0 0 1 1 1 1 0 1 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0]
Logist

In [19]:
!git clone https://github.com/zuba1rkhan/NLP.git


Cloning into 'NLP'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (3/3), done.
