In [1]:
import pandas as pd

In [26]:
# Load the citation data from citation.csv into a DataFrame
df_citation = pd.read_csv("citation.csv")

In [27]:
# Show the first 5 rows (the default for head())
df_citation.head()

Unnamed: 0,doi,times_cited
0,10.1177/20563051221138753,9
1,10.1177/20563051221138758,4
2,10.1177/2056305117733224,32
3,10.1177/2056305120949268,1
4,10.1177/2056305119898778,17


In [28]:
# Show the column names
df_citation.columns

Index(['doi', 'times_cited'], dtype='object')

In [29]:
# Show the data type of each column
df_citation.dtypes

doi            object
times_cited     int64
dtype: object

In [30]:
# Count the missing values in each column
df_citation.isna().sum()

doi            0
times_cited    0
dtype: int64

In [31]:
# 1. Summary statistics for times_cited; the mean is used below as the cut-off point

df_citation['times_cited'].describe()

count    473.000000
mean      31.818182
std       62.853380
min        0.000000
25%        4.000000
50%       14.000000
75%       35.000000
max      789.000000
Name: times_cited, dtype: float64

In [32]:
# Function to classify as highly cited if above average

# cut-off = 31.818182

def highlycited(citations, threshold=31.818182):
    """Classify a citation count as highly cited.

    Parameters
    ----------
    citations : number
        Number of times an article has been cited.
    threshold : float, optional
        Cut-off at or above which an article counts as highly cited.
        Defaults to 31.818182, the mean of ``times_cited`` reported by
        ``describe()`` earlier in this notebook.

    Returns
    -------
    bool
        True if ``citations >= threshold``, otherwise False.
    """
    # bool() keeps the return a plain Python bool even for numpy integers
    return bool(citations >= threshold)

In [36]:
# Here I am going to show you THREE ways to do the same thing (pick one to use):

# Way 1: start with an empty list, then loop over the column and append each result
is_highlycited = []
for i in df_citation['times_cited']:
    is_highlycited.append(highlycited(i))

In [None]:
# Next we add the list to the DataFrame as a new column
df_citation['is_highlycited'] = is_highlycited

In [38]:
# Way 2: a list comprehension builds the same list as the loop, in a single line
df_citation['is_highlycited2'] = [highlycited(x) for x in df_citation['times_cited']]

In [39]:
# Way 3: Series.apply calls highlycited on every value in the column
df_citation['is_highlycited3'] = df_citation['times_cited'].apply(highlycited)

In [40]:
# Preview the first rows to compare the three new columns
df_citation.head()

Unnamed: 0,doi,times_cited,is_highlycited,is_highlycited2,is_highlycited3
0,10.1177/20563051221138753,9,False,False,False
1,10.1177/20563051221138758,4,False,False,False
2,10.1177/2056305117733224,32,True,True,True
3,10.1177/2056305120949268,1,False,False,False
4,10.1177/2056305119898778,17,False,False,False


In [42]:
# Count how many articles fall in each class (True = highly cited)
df_citation['is_highlycited'].value_counts()

is_highlycited
False    337
True     136
Name: count, dtype: int64

### Question: What should I do if I want a smaller dataset containing only highly cited articles?

In [None]:
# What is the symbol to use?
# What is the condition?



## Merging

In [44]:
# Load the article information from article_info.csv into a DataFrame
df_article = pd.read_csv("article_info.csv")

In [45]:
# Show the first 5 rows
df_article.head()

Unnamed: 0,doi,title,abstract,year,journal.title
0,10.1177/20563051221138753,Discursive Strategies of Blaming: The Language...,Modern politics is permeated by blame games-sy...,2022,Social Media + Society
1,10.1177/20563051221138758,Hate Speech in a Telegram Conspiracy Channel D...,Research has explored how the COVID-19 pandemi...,2022,Social Media + Society
2,10.1177/2056305117733224,Twitter and Non-Elites: Interpreting Power Dyn...,"In May 2013 and March 2015, actress Angelina J...",2017,Social Media + Society
3,10.1177/2056305120949268,“Mischievous Uncles” as Rule Breakers: Interse...,Responses to crises can highlight and exacerba...,2020,Social Media + Society
4,10.1177/2056305119898778,Building Social Media Observatories for Monito...,Social media house a trove of relevant informa...,2020,Social Media + Society


In [48]:
# Show the column names
df_article.columns

Index(['doi', 'title', 'abstract', 'year', 'journal.title'], dtype='object')

In [49]:
# Show the data type of each column
df_article.dtypes

doi              object
title            object
abstract         object
year              int64
journal.title    object
dtype: object

In [50]:
# Count the missing values in each column
df_article.isna().sum()

doi               0
title             0
abstract         21
year              0
journal.title     0
dtype: int64

In [51]:
# Left join on the shared 'doi' column: every row of df_citation is kept and
# the matching article columns are attached to it
df_full = df_citation.merge(df_article, how='left', on='doi')

In [52]:
# Preview the merged table
df_full.head()

Unnamed: 0,doi,times_cited,is_highlycited,is_highlycited2,is_highlycited3,title,abstract,year,journal.title
0,10.1177/20563051221138753,9,False,False,False,Discursive Strategies of Blaming: The Language...,Modern politics is permeated by blame games-sy...,2022,Social Media + Society
1,10.1177/20563051221138758,4,False,False,False,Hate Speech in a Telegram Conspiracy Channel D...,Research has explored how the COVID-19 pandemi...,2022,Social Media + Society
2,10.1177/2056305117733224,32,True,True,True,Twitter and Non-Elites: Interpreting Power Dyn...,"In May 2013 and March 2015, actress Angelina J...",2017,Social Media + Society
3,10.1177/2056305120949268,1,False,False,False,“Mischievous Uncles” as Rule Breakers: Interse...,Responses to crises can highlight and exacerba...,2020,Social Media + Society
4,10.1177/2056305119898778,17,False,False,False,Building Social Media Observatories for Monito...,Social media house a trove of relevant informa...,2020,Social Media + Society


In [54]:
# Save the merged table. index=False leaves out the automatic 0..n-1 row index,
# which would otherwise be written as an extra unnamed first column and show up
# as 'Unnamed: 0' when the file is read back with pd.read_csv.
df_full.to_csv('article_with_citation.csv', index=False)