# COVID-19 Journal Classification using Machine Learning Techniques<br>
## Team Members:<br>
1. Yerramaddu Jahnavi - 181CO260 <br>
2. Swathi J S - 181CO155

The main aim of our system is to classify COVID-19-related abstracts by the journal that published them, so that a researcher can go straight to the articles of interest in the relevant journals instead of searching through every article.

### Importing the necessary libraries 

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import future

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn import metrics
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import label_binarize

from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.svm import SVR
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier


from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords

### Data Description | Loading the dataset 

In [None]:
data = pd.read_csv("metadata.csv")

In [None]:
data.shape

In [None]:
data.head(10)

In [None]:
data['source_x'].value_counts()

In [None]:
data.dtypes

In [None]:
data['journal'].value_counts()

In [None]:
# Keep only the articles from the three journals under study and save the
# listed metadata columns of that subset for the following steps.
filt = data['journal'].isin(['bioRxiv', 'PLoS One', 'BMJ'])
updated_df = data.loc[
    filt,
    ['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id', 'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id', 'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files', 'url', 's2_id'],
]
updated_df.to_csv('./data3.csv', index=False)


In [None]:
data_filtered=pd.read_csv("data3.csv")

In [None]:
data_filtered.head()

In [None]:
data_filtered['journal'].value_counts()

In [None]:
data_filtered.shape

In [None]:
data_filtered.dtypes

### Language Detection Module

In [None]:
from langdetect import DetectorFactory, LangDetectException, detect

# langdetect is non-deterministic by default; fixing the seed makes the
# detected languages reproducible between runs.
DetectorFactory.seed = 0


def _detect_language(text):
    """Return the language code langdetect assigns to ``text``.

    Returns ``"unknown"`` when ``text`` is not a non-blank string (for example
    a missing title read as NaN) or when langdetect cannot classify it.
    """
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"


# One language code per title, stored alongside the article metadata.
language = [_detect_language(title) for title in data_filtered['title']]
data_filtered['Title Language'] = language

In [None]:
data_filtered.head()

In [None]:
data_filtered['Title Language'].value_counts()

In [None]:
data_filtered['abstract'] = data_filtered['abstract'].apply(str)

In [None]:
# Detect the language of every abstract and keep it in its own column.
language = [detect(abstract) for abstract in data_filtered['abstract']]
data_filtered['Abstract Language'] = language

In [None]:
data_filtered['Abstract Language'].value_counts()

In [None]:
# Retain only the rows whose abstract was detected as English and save them.
filt = data_filtered['Abstract Language'].eq('en')
updated_df = data_filtered.loc[filt, :]
updated_df.to_csv('./data4.csv', index=False)

In [None]:
english_data_filtered = pd.read_csv("data4.csv")

In [None]:
english_data_filtered.head()

In [None]:
english_data_filtered['Title Language'].value_counts()

### Named Entity Recognition

In [None]:
import spacy
from spacy import displacy
from collections import Counter
ner = spacy.load("en_core_web_lg")

In [None]:
## tag text
txt = english_data_filtered["abstract"].iloc[1]
doc = ner(txt)
## display result
spacy.displacy.render(doc, style="ent")

In [None]:
import collections

In [None]:
## tag text and extract tags into a list
## (each abstract becomes a list of (entity text, entity label) tuples)
english_data_filtered["tags"] = english_data_filtered["abstract"].apply(lambda x: [(tag.text, tag.label_) for tag in ner(x).ents] )
## utils function to count the elements of a list
def utils_lst_count(lst):
    """Count the items of ``lst`` and return them most frequent first.

    Returns a list of single-entry dicts ``{item: count}`` ordered by
    decreasing count; items with equal counts keep their first-seen order.
    """
    ranked = collections.Counter(lst).most_common()
    return [{item: count} for item, count in ranked]

## count tags
english_data_filtered["tags"] = english_data_filtered["tags"].apply(lambda x: utils_lst_count(x))

## utils function create new column for each tag category
def utils_ner_features(lst_dics_tuples, tag):
    """Return how many entities of type ``tag`` one document contains.

    Args:
        lst_dics_tuples: list of dicts mapping ``(text, label)`` keys to the
            number of times that entity occurs (the per-document value of the
            "tags" column after counting).
        tag: entity label to total up, e.g. ``"ORG"``.

    Returns:
        The sum of the counts of all entries whose label equals ``tag``;
        0 when the list is empty or no entry has that label.
    """
    # Sum the matching counts directly instead of expanding every label into a
    # list and rebuilding a Counter on each iteration.
    return sum(
        count
        for dic_tuples in lst_dics_tuples
        for key, count in dic_tuples.items()
        if key[1] == tag
    )

## extract features
# Collect every entity label present in the corpus. Sorting makes the order of
# the generated "tags_<LABEL>" columns independent of set iteration order, so
# the resulting frame has the same layout on every run.
tags_set = sorted({
    key[1]
    for lst in english_data_filtered["tags"].tolist()
    for dic in lst
    for key in dic
})
# One count column per entity label.
for feature in tags_set:
    english_data_filtered["tags_"+feature] = english_data_filtered["tags"].apply(lambda x: utils_ner_features(x, feature))

## print result
english_data_filtered.head()

In [None]:
english_data_filtered['tags_CARDINAL']

### Data Cleaning 

In [None]:
from nltk.stem.snowball import SnowballStemmer
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
# Holds the English stop-word set once it has been loaded, so that it is not
# rebuilt for every document passed to CleanText.
_ENGLISH_STOPWORDS_CACHE = []


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, building it only once."""
    if not _ENGLISH_STOPWORDS_CACHE:
        _ENGLISH_STOPWORDS_CACHE.append(frozenset(stopwords.words("english")))
    return _ENGLISH_STOPWORDS_CACHE[0]


def CleanText(raw_text, remove_stopwords=True, stemming= False, flg_lemm=True, split_text=True):
    '''
    Convert a raw abstract into a list of cleaned, lower-case word tokens.

    Steps, in order: strip HTML markup, replace every non-letter character
    with a space, lower-case and split on whitespace, then optionally remove
    English stop words, stem, and lemmatize.

    Args:
        raw_text: text to clean (may contain HTML).
        remove_stopwords: drop NLTK English stop words when true.
        stemming: apply the Snowball English stemmer when true.
        flg_lemm: apply the WordNet lemmatizer when true.
        split_text: accepted for interface compatibility; not used, the
            result is always a list of tokens.

    Returns:
        list of str: the cleaned tokens.
    '''
    text = BeautifulSoup(raw_text, 'lxml').get_text()  # remove html
    letters_only = re.sub("[^a-zA-Z]", " ", text)  # keep letters only
    words = letters_only.lower().split()  # lower-case tokens

    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    if stemming:
        stemmer = SnowballStemmer('english')
        words = [stemmer.stem(w) for w in words]

    if flg_lemm:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        words = [lem.lemmatize(word) for word in words]

    return words

In [None]:
# Clean every abstract and store the resulting token lists in a new column.
cleaned_abstract = [CleanText(text) for text in english_data_filtered['abstract']]
english_data_filtered['Cleaned Abstract'] = cleaned_abstract
english_data_filtered.head()

In [None]:
english_data_filtered['Cleaned Abstract']

In [None]:
# Turn each token list back into a single space-separated string.
# str.join puts a blank only between tokens, so the text has no leading or
# trailing space that would inflate whitespace-based word counts.
sen = [" ".join(tokens) for tokens in english_data_filtered['Cleaned Abstract']]
english_data_filtered['Cleaned Abstract'] = sen

In [None]:
english_data_filtered.head()

In [None]:
#PLoS One - Most Frequent Tags in "PLoS One" Journal Abstract
import seaborn as sns

# Concatenate the per-abstract tag lists of this journal, count the entries of
# each (tag, type) pair and plot the ten pairs with the highest count.
journal_mask = english_data_filtered["journal"] == "PLoS One"
tags_list = english_data_filtered.loc[journal_mask, "tags"].sum()
map_lst = [list(dic)[0] for dic in tags_list]
english_data_filtered_tags = pd.DataFrame(map_lst, columns=['tag','type'])
english_data_filtered_tags["count"] = 1
english_data_filtered_tags = (
    english_data_filtered_tags
    .groupby(['type', 'tag'])
    .count()
    .reset_index()
    .sort_values("count", ascending=False)
)
fig, ax = plt.subplots()
fig.suptitle("Top frequent tags", fontsize=12)
top_tags = english_data_filtered_tags.iloc[:10, :]
sns.barplot(x="count", y="tag", hue="type", data=top_tags, dodge=False, ax=ax)
ax.grid(axis="x")
plt.show()

In [None]:
#bioRxiv - Most Frequent Tags in "bioRxiv" Journal Abstract
import seaborn as sns

# Concatenate the per-abstract tag lists of this journal, count the entries of
# each (tag, type) pair and plot the ten pairs with the highest count.
journal_mask = english_data_filtered["journal"] == "bioRxiv"
tags_list = english_data_filtered.loc[journal_mask, "tags"].sum()
map_lst = [list(dic)[0] for dic in tags_list]
english_data_filtered_tags = pd.DataFrame(map_lst, columns=['tag','type'])
english_data_filtered_tags["count"] = 1
english_data_filtered_tags = (
    english_data_filtered_tags
    .groupby(['type', 'tag'])
    .count()
    .reset_index()
    .sort_values("count", ascending=False)
)
fig, ax = plt.subplots()
fig.suptitle("Top frequent tags", fontsize=12)
top_tags = english_data_filtered_tags.iloc[:10, :]
sns.barplot(x="count", y="tag", hue="type", data=top_tags, dodge=False, ax=ax)
ax.grid(axis="x")
plt.show()

In [None]:
#BMJ - Most Frequent Tags in "BMJ" Journal Abstract
import seaborn as sns

# Concatenate the per-abstract tag lists of this journal, count the entries of
# each (tag, type) pair and plot the ten pairs with the highest count.
journal_mask = english_data_filtered["journal"] == "BMJ"
tags_list = english_data_filtered.loc[journal_mask, "tags"].sum()
map_lst = [list(dic)[0] for dic in tags_list]
english_data_filtered_tags = pd.DataFrame(map_lst, columns=['tag','type'])
english_data_filtered_tags["count"] = 1
english_data_filtered_tags = (
    english_data_filtered_tags
    .groupby(['type', 'tag'])
    .count()
    .reset_index()
    .sort_values("count", ascending=False)
)
fig, ax = plt.subplots()
fig.suptitle("Top frequent tags", fontsize=12)
top_tags = english_data_filtered_tags.iloc[:10, :]
sns.barplot(x="count", y="tag", hue="type", data=top_tags, dodge=False, ax=ax)
ax.grid(axis="x")
plt.show()

### Length Analysis

In [None]:
# Length features.
# Words and characters are measured on the cleaned text. split() without an
# argument ignores leading, trailing and repeated blanks, so empty strings are
# never counted as words.
english_data_filtered['word_count'] = english_data_filtered["Cleaned Abstract"].apply(lambda x: len(str(x).split()))
english_data_filtered['char_count'] = english_data_filtered["Cleaned Abstract"].apply(lambda x: sum(len(word) for word in str(x).split()))
# The cleaned text contains letters only, so it has no periods left to split
# on; sentences are therefore counted on the original abstract. This is an
# approximation: every non-empty "."-separated segment counts as one sentence,
# with a minimum of 1 to keep the average below well defined.
english_data_filtered['sentence_count'] = english_data_filtered["abstract"].apply(
    lambda x: max(1, sum(1 for segment in str(x).split(".") if segment.strip())))
english_data_filtered['avg_word_length'] = english_data_filtered['char_count'] / english_data_filtered['word_count']
english_data_filtered['avg_sentence_length'] = english_data_filtered['word_count'] / english_data_filtered['sentence_count']
english_data_filtered.head()

In [None]:
import seaborn as sns

# Distribution of the character count per journal: overlaid histograms on the
# left axis, kernel density estimates on the right axis.
x, y = "char_count", "journal"
fig, ax = plt.subplots(nrows=1, ncols=2)
fig.suptitle(x, fontsize=12)
hist_ax, kde_ax = ax
for i in english_data_filtered[y].unique():
    values = english_data_filtered.loc[english_data_filtered[y] == i, x]
    sns.distplot(values, hist=True, kde=False, bins=10,
                 hist_kws={"alpha": 0.8}, axlabel="histogram", ax=hist_ax)
    sns.distplot(values, hist=False, kde=True,
                 kde_kws={"shade": True}, axlabel="density", ax=kde_ax)
for axis in ax:
    axis.grid(True)
# Legend labels are supplied in the same order in which the journals are drawn.
hist_ax.legend(english_data_filtered[y].unique())
plt.show()

In [None]:
# Distribution of the word count per journal: overlaid histograms on the left
# axis, kernel density estimates on the right axis.
x, y = "word_count", "journal"
fig, ax = plt.subplots(nrows=1, ncols=2)
fig.suptitle(x, fontsize=12)
hist_ax, kde_ax = ax
for i in english_data_filtered[y].unique():
    values = english_data_filtered.loc[english_data_filtered[y] == i, x]
    sns.distplot(values, hist=True, kde=False, bins=10,
                 hist_kws={"alpha": 0.8}, axlabel="histogram", ax=hist_ax)
    sns.distplot(values, hist=False, kde=True,
                 kde_kws={"shade": True}, axlabel="density", ax=kde_ax)
for axis in ax:
    axis.grid(True)
# Legend labels are supplied in the same order in which the journals are drawn.
hist_ax.legend(english_data_filtered[y].unique())
plt.show()

In [None]:
# Distribution of the average word length per journal: overlaid histograms on
# the left axis, kernel density estimates on the right axis.
x, y = "avg_word_length", "journal"
fig, ax = plt.subplots(nrows=1, ncols=2)
fig.suptitle(x, fontsize=12)
hist_ax, kde_ax = ax
for i in english_data_filtered[y].unique():
    values = english_data_filtered.loc[english_data_filtered[y] == i, x]
    sns.distplot(values, hist=True, kde=False, bins=10,
                 hist_kws={"alpha": 0.8}, axlabel="histogram", ax=hist_ax)
    sns.distplot(values, hist=False, kde=True,
                 kde_kws={"shade": True}, axlabel="density", ax=kde_ax)
for axis in ax:
    axis.grid(True)
# Legend labels are supplied in the same order in which the journals are drawn.
hist_ax.legend(english_data_filtered[y].unique())
plt.show()

In [None]:
# Distribution of the average sentence length per journal: overlaid histograms
# on the left axis, kernel density estimates on the right axis.
x, y = "avg_sentence_length", "journal"
fig, ax = plt.subplots(nrows=1, ncols=2)
fig.suptitle(x, fontsize=12)
hist_ax, kde_ax = ax
for i in english_data_filtered[y].unique():
    values = english_data_filtered.loc[english_data_filtered[y] == i, x]
    sns.distplot(values, hist=True, kde=False, bins=10,
                 hist_kws={"alpha": 0.8}, axlabel="histogram", ax=hist_ax)
    sns.distplot(values, hist=False, kde=True,
                 kde_kws={"shade": True}, axlabel="density", ax=kde_ax)
for axis in ax:
    axis.grid(True)
# Legend labels are supplied in the same order in which the journals are drawn.
hist_ax.legend(english_data_filtered[y].unique())
plt.show()

### Word Frequency 

In [None]:
from nltk.tokenize import word_tokenize
from nltk import FreqDist

#PLoS One
# Tokenize all cleaned abstracts of the journal as one text, then plot the
# ten most frequent unigrams (left) and bigrams (right).
journal_mask = english_data_filtered["journal"] == "PLoS One"
corpus = english_data_filtered.loc[journal_mask, "Cleaned Abstract"]
lst_tokens = nltk.tokenize.word_tokenize(corpus.str.cat(sep=" "))
fig, ax = plt.subplots(nrows=1, ncols=2)
fig.suptitle("Most frequent words", fontsize=15)

## unigrams
dic_words_freq = nltk.FreqDist(lst_tokens)
english_data_filtered_uni_PLoSOne = pd.DataFrame(
    dic_words_freq.most_common(), columns=["Word", "Freq"])
top_unigrams = (english_data_filtered_uni_PLoSOne
                .set_index("Word").iloc[:10, :].sort_values(by="Freq"))
top_unigrams.plot(kind="barh", title="Unigrams", ax=ax[0],
                  legend=False).grid(axis='x')
ax[0].set(ylabel=None)

## bigrams
dic_words_freq = nltk.FreqDist(nltk.ngrams(lst_tokens, 2))
english_data_filtered_bi_PLoSOne = pd.DataFrame(
    dic_words_freq.most_common(), columns=["Word", "Freq"])
# Each bigram is a tuple of two tokens; show it as "first second".
english_data_filtered_bi_PLoSOne["Word"] = (
    english_data_filtered_bi_PLoSOne["Word"].apply(" ".join))
top_bigrams = (english_data_filtered_bi_PLoSOne
               .set_index("Word").iloc[:10, :].sort_values(by="Freq"))
top_bigrams.plot(kind="barh", title="Bigrams", ax=ax[1],
                 legend=False).grid(axis='x')
ax[1].set(ylabel=None)
plt.show()

In [None]:
#BMJ
# Tokenize all cleaned abstracts of the journal as one text, then plot the
# ten most frequent unigrams (left) and bigrams (right).
journal_mask = english_data_filtered["journal"] == "BMJ"
corpus = english_data_filtered.loc[journal_mask, "Cleaned Abstract"]
lst_tokens = nltk.tokenize.word_tokenize(corpus.str.cat(sep=" "))
fig, ax = plt.subplots(nrows=1, ncols=2)
fig.suptitle("Most frequent words", fontsize=15)

## unigrams
dic_words_freq = nltk.FreqDist(lst_tokens)
english_data_filtered_uni_BMJ = pd.DataFrame(
    dic_words_freq.most_common(), columns=["Word", "Freq"])
top_unigrams = (english_data_filtered_uni_BMJ
                .set_index("Word").iloc[:10, :].sort_values(by="Freq"))
top_unigrams.plot(kind="barh", title="Unigrams", ax=ax[0],
                  legend=False).grid(axis='x')
ax[0].set(ylabel=None)

## bigrams
dic_words_freq = nltk.FreqDist(nltk.ngrams(lst_tokens, 2))
english_data_filtered_bi_BMJ = pd.DataFrame(
    dic_words_freq.most_common(), columns=["Word", "Freq"])
# Each bigram is a tuple of two tokens; show it as "first second".
english_data_filtered_bi_BMJ["Word"] = (
    english_data_filtered_bi_BMJ["Word"].apply(" ".join))
top_bigrams = (english_data_filtered_bi_BMJ
               .set_index("Word").iloc[:10, :].sort_values(by="Freq"))
top_bigrams.plot(kind="barh", title="Bigrams", ax=ax[1],
                 legend=False).grid(axis='x')
ax[1].set(ylabel=None)
plt.show()

In [None]:
#bioRxiv
# Tokenize all cleaned abstracts of the journal as one text, then plot the
# ten most frequent unigrams (left) and bigrams (right).
journal_mask = english_data_filtered["journal"] == "bioRxiv"
corpus = english_data_filtered.loc[journal_mask, "Cleaned Abstract"]
lst_tokens = nltk.tokenize.word_tokenize(corpus.str.cat(sep=" "))
fig, ax = plt.subplots(nrows=1, ncols=2)
fig.suptitle("Most frequent words", fontsize=15)

## unigrams
dic_words_freq = nltk.FreqDist(lst_tokens)
english_data_filtered_uni_bioRxiv = pd.DataFrame(
    dic_words_freq.most_common(), columns=["Word", "Freq"])
top_unigrams = (english_data_filtered_uni_bioRxiv
                .set_index("Word").iloc[:10, :].sort_values(by="Freq"))
top_unigrams.plot(kind="barh", title="Unigrams", ax=ax[0],
                  legend=False).grid(axis='x')
ax[0].set(ylabel=None)

## bigrams
dic_words_freq = nltk.FreqDist(nltk.ngrams(lst_tokens, 2))
english_data_filtered_bi_bioRxiv = pd.DataFrame(
    dic_words_freq.most_common(), columns=["Word", "Freq"])
# Each bigram is a tuple of two tokens; show it as "first second".
english_data_filtered_bi_bioRxiv["Word"] = (
    english_data_filtered_bi_bioRxiv["Word"].apply(" ".join))
top_bigrams = (english_data_filtered_bi_bioRxiv
               .set_index("Word").iloc[:10, :].sort_values(by="Freq"))
top_bigrams.plot(kind="barh", title="Bigrams", ax=ax[1],
                 legend=False).grid(axis='x')
ax[1].set(ylabel=None)
plt.show()

### Word Cloud

In [None]:
from wordcloud import WordCloud

# Word cloud of the cleaned bioRxiv abstracts.
wc = WordCloud(background_color='black', max_words=100,
               max_font_size=35)
corpus = english_data_filtered[english_data_filtered["journal"]=="bioRxiv"]["Cleaned Abstract"]
# str(corpus) would only give the printed form of the Series (index labels,
# shortened rows, dtype footer), so the cloud is built from the concatenated
# text of all abstracts instead.
wc_bioRxiv = wc.generate(corpus.str.cat(sep=" "))
fig = plt.figure(num=1)
plt.axis('off')
plt.imshow(wc_bioRxiv, cmap=None)
plt.show()

In [None]:
# Word cloud of the cleaned BMJ abstracts.
wc = WordCloud(background_color='black', max_words=100,
               max_font_size=35)
corpus = english_data_filtered[english_data_filtered["journal"]=="BMJ"]["Cleaned Abstract"]
# str(corpus) would only give the printed form of the Series (index labels,
# shortened rows, dtype footer), so the cloud is built from the concatenated
# text of all abstracts instead.
wc_BMJ = wc.generate(corpus.str.cat(sep=" "))
fig = plt.figure(num=1)
plt.axis('off')
plt.imshow(wc_BMJ, cmap=None)
plt.show()

In [None]:
# Word cloud of the cleaned PLoS One abstracts.
wc = WordCloud(background_color='black', max_words=100,
               max_font_size=35)
corpus = english_data_filtered[english_data_filtered["journal"]=="PLoS One"]["Cleaned Abstract"]
# str(corpus) would only give the printed form of the Series (index labels,
# shortened rows, dtype footer), so the cloud is built from the concatenated
# text of all abstracts instead.
wc_PLoSOne = wc.generate(corpus.str.cat(sep=" "))
fig = plt.figure(num=1)
plt.axis('off')
plt.imshow(wc_PLoSOne, cmap=None)
plt.show()

### Text Preprocessing 

In [None]:
english_data_filtered.columns

In [None]:
# Working frame for modelling: drop the metadata columns that are not used
# from here on (identifiers, raw text, links, title language).
dtf = english_data_filtered.drop(
    columns=['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id','license', 'abstract', 'publish_time', 'authors','mag_id','who_covidence_id','arxiv_id','pdf_json_files','pmc_json_files','url','s2_id','Title Language'],
)

In [None]:
dtf.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
dtf['y'] = le.fit_transform(dtf['journal'])

In [None]:
dtf.head()

In [None]:
dtf['y'].value_counts()

In [None]:
dtf['journal'].value_counts()

In [None]:
dtf=dtf.drop(['sentence_count'],axis=1)

In [None]:
dtf.describe()

In [None]:
dtf.head()

In [None]:
dtf=dtf.drop(['char_count'],axis=1)

In [None]:
#Bigrams
english_data_filtered_bi_bioRxiv

### Feature Engineering 

The TF-IDF weight is the product of two terms: the first is the normalized Term Frequency (TF); the second is the Inverse Document Frequency (IDF), computed as the logarithm of the number of documents in the corpus divided by the number of documents in which the specific term appears.<br>

TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)<br>
IDF(t) = log_e(Total number of documents / Number of documents with term t in it)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF matrix of the whole cleaned corpus (unigrams and bigrams, at most
# 10000 features), used for inspection in the following cells.
abstract_text = dtf['Cleaned Abstract'].values.tolist()
vect_word = TfidfVectorizer(
    max_features=10000,
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),
    dtype=np.float32,
)
tfidf_complete = vect_word.fit_transform(abstract_text)

In [None]:
print(tfidf_complete)

In [None]:
tfidf_complete.shape

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older versions.
if hasattr(vect_word, "get_feature_names_out"):
    # Convert the returned array to a plain list of str for printing.
    feature_names = [str(name) for name in vect_word.get_feature_names_out()]
else:
    feature_names = vect_word.get_feature_names()
print ("Number of features : %d \n" %len(feature_names))
print ("Show some feature names : \n", feature_names[::100])

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older versions.
if hasattr(vect_word, "get_feature_names_out"):
    idf_terms = [str(name) for name in vect_word.get_feature_names_out()]
else:
    idf_terms = vect_word.get_feature_names()
# One row per vocabulary term. The column keeps its original name although the
# values are the fitted IDF weights (vect_word.idf_).
tfidf = pd.DataFrame({'tfidf_complete': vect_word.idf_}, index=idf_terms)
tfidf.sort_values(by=['tfidf_complete'], ascending=False)

In [None]:
import scipy.sparse as sparse
plt.figure(figsize=(10,10))
plt.spy(tfidf_complete,markersize=0.015)

### Feature Selection 

In [None]:
# Hold out 30% of the abstracts for testing. A fixed random_state makes the
# split (and every score computed from it) reproducible, and stratifying on
# the label keeps the journal proportions equal in the train and test parts.
x_train, x_test, y_train, y_test = train_test_split(
    dtf['Cleaned Abstract'], dtf['y'],
    test_size=0.3, random_state=42, stratify=dtf['y'],
)

In [None]:
vect_word = TfidfVectorizer(max_features=10000, analyzer='word', stop_words='english', ngram_range=(1,2), dtype=np.float32)
train_tfidf = vect_word.fit_transform(x_train)
test_tfidf = vect_word.transform(x_test)

In [None]:
from sklearn.feature_selection import SelectKBest, chi2
def select_features(train_X, train_y, test_X, k):
    """Keep the ``k`` features with the highest chi-squared score.

    The selector is fitted on the training data only and then applied to both
    matrices. With ``k == 'all'`` both inputs are returned unchanged.

    Returns:
        tuple: ``(train_X, test_X)`` reduced to the selected features.
    """
    if k != 'all':
        selector = SelectKBest(chi2, k=k)
        train_X = selector.fit_transform(train_X, train_y)
        test_X = selector.transform(test_X)
    return train_X, test_X

In [None]:
tfidf_train_final,tfidf_test_final = select_features(train_tfidf,y_train,test_tfidf,627) 

In [None]:
tfidf_train_final.shape

In [None]:
tfidf_test_final.shape

In [None]:
print(tfidf_train_final)

In [None]:
plt.figure(figsize=(20,100))
plt.spy(tfidf_train_final,markersize=0.15)

### Training and Testing of the Model

### SVM

In [None]:
# One-vs-rest linear SVM trained on the selected TF-IDF features and scored on
# the held-out split.
linear_svc = LinearSVC(multi_class='ovr').fit(tfidf_train_final, y_train)
predictions = linear_svc.predict(tfidf_test_final)
svm_accuracy = metrics.accuracy_score(y_test, predictions)
svm_confusion = metrics.confusion_matrix(y_test, predictions)
print ("Accuracy of this SVM = " + str(svm_accuracy))
print ("Confusion matrix = " + str(svm_confusion))

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-nearest-neighbours classifier with k = 5, scored on the held-out
# split.
model = KNeighborsClassifier(n_neighbors=5).fit(tfidf_train_final, y_train)
predictions = model.predict(tfidf_test_final)
knn_accuracy = metrics.accuracy_score(y_test, predictions)
knn_confusion = metrics.confusion_matrix(y_test, predictions)
print ("Accuracy of this KNN = " + str(knn_accuracy))
print ("Confusion matrix = " + str(knn_confusion))


In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

In [None]:
# Error rate of KNN for k = 1..99.
# NOTE(review): the error is measured on the test split, so picking k from
# this curve uses test information; consider cross-validating on the training
# data instead.
k_values = np.arange(1, 100)
error_rates = []
for i in k_values:
    new_model = KNeighborsClassifier(n_neighbors=i)
    new_model.fit(tfidf_train_final, y_train)
    new_predictions = new_model.predict(tfidf_test_final)
    error_rates.append(np.mean(new_predictions != y_test))
# Plot against the actual k values; plotting the list alone would use the
# 0-based position on the x-axis and shift every k by one.
plt.plot(k_values, error_rates)
plt.xlabel("n_neighbors (k)")
plt.ylabel("error rate")

In [None]:
# KNN refitted with k = 18 and scored on the held-out split.
model = KNeighborsClassifier(n_neighbors=18).fit(tfidf_train_final, y_train)
predictions = model.predict(tfidf_test_final)
knn_accuracy = metrics.accuracy_score(y_test, predictions)
knn_confusion = metrics.confusion_matrix(y_test, predictions)
print ("Accuracy of this KNN = " + str(knn_accuracy))
print ("Confusion matrix = " + str(knn_confusion))

### XGBoost 

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, KFold

# Fit a default XGBoost classifier on the training features, then report its
# mean score over 5-fold cross-validation of the same training data.
xgbc = XGBClassifier()
xgbc.fit(tfidf_train_final, y_train)
scores = cross_val_score(estimator=xgbc, X=tfidf_train_final, y=y_train, cv=5)
mean_cv_score = scores.mean()
print("Mean cross-validation score: %.2f" % mean_cv_score)

In [None]:
# Score the fitted XGBoost classifier on the held-out split.
predictions = xgbc.predict(tfidf_test_final)
xgb_accuracy = metrics.accuracy_score(y_test, predictions)
xgb_confusion = metrics.confusion_matrix(y_test, predictions)
print ("Accuracy of this XGBoost = " + str(xgb_accuracy))
print ("Confusion matrix = " + str(xgb_confusion))