In [23]:
import pandas as pd
import numpy as np
import re

data = pd.read_csv("WELLCOME_APCspend2013_forThinkful.csv", encoding='latin-1')

df = pd.DataFrame(data)

In [59]:
df.head(200)

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
0,,CUP,PSYCHOLOGICAL MEDICINE,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,BIOMACROMOLECULES,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J MED CHEM,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J MED CHEM,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J ORG CHEM,Regioselective opening of myo-inositol orthoes...,£685.88
5,PMC3579457,ACS,JOURNAL OF MEDICINAL CHEMISTRY,Comparative Structural and Functional Studies ...,£2392.20
6,PMC3709265,ACS,JOURNAL OF PROTEOME RESEARCH,Mapping Proteolytic Processing in the Secretom...,£2367.95
7,23057412 PMC3495574,ACS,MOL PHARM,Quantitative silencing of EGFP reporter gene b...,£649.33
8,PMCID: PMC3780468,ACS (Amercian Chemical Society) Publications,ACS CHEMICAL BIOLOGY,A Novel Allosteric Inhibitor of the Uridine Di...,£1294.59
9,PMCID: PMC3621575,ACS (Amercian Chemical Society) Publications,ACS CHEMICAL BIOLOGY,Chemical proteomic analysis reveals the drugab...,£1294.78


# A. Determine the five most common journals and the total articles for each

## A-1. Find the five most common journals

In [25]:
data["Journal title"].value_counts()

PLoS One                                                              92
PLoS ONE                                                              62
Journal of Biological Chemistry                                       48
Nucleic Acids Research                                                21
Proceedings of the National Academy of Sciences                       19
                                                                      ..
Cell Host & Microbe                                                    1
BMC Genetics                                                           1
Medical Humanities                                                     1
Cellular and Molecular Cell Sciences                                   1
Birth Defects Research Part A: Clinical and Molecular Teratology       1
Name: Journal title, Length: 984, dtype: int64

__In the row data, we have 928 journals. How many unique titles?__

In [26]:
data["Journal title"].unique()

array(['Psychological Medicine', 'Biomacromolecules', 'J Med Chem',
       'J Org Chem', 'Journal of Medicinal Chemistry',
       'Journal of Proteome Research', 'Mol Pharm',
       'ACS Chemical Biology',
       'Journal of Chemical Information and Modeling', 'Biochemistry',
       'Gastroenterology', 'Journal of Biological Chemistry',
       'Journal of Immunology', 'ACS Chemical Neuroscience', 'ACS NANO',
       'American Chemical Society', 'Analytical Chemistry',
       'Bioconjugate Chemistry', 'Journal of Medicinal Chemistry ',
       'Journal of the American Chemical Society', 'ACS Nano', 'CHEST',
       'Journal of Neurophysiology', 'Journal of Physiology',
       'The Journal of Neurophysiology', 'American Journal of Psychiatry',
       'Americal Journal of Psychiatry', 'Behavioral Neuroscience',
       'Emotion', 'Health Psychology', 'Journal of Abnormal Psychology',
       'Journal of Consulting and Clinical Psychology',
       'Journal of Experimental Psychology:  Animal Be

__Seems like we have to clean up the data a lot.....__
1.  Mixture of upper & lower case (e.g. PLoS One & PLoS ONE )
2.  Missspelling ('International Journal of __Behavioral__ Nutrition and Physical Activity' & 'International Journal of __Behavioural__ Nutrition and Physical Activity',)
3. Abbreviation（PNAS, Proceedings of the National Academy of Sciences, & P.N.A.S.）
4. Different subtitle ('Biochimica et Bioohysica Acta - Gene Regulatory Mechanisms' & 'Biochimica et Bioohysica Acta - Molecular Basis of Disease')
5. Extra Spaces
6. All Blank Cells
7. Formatting

In [27]:
# to get rid of case like PLoS One & PLoS ONE, make every data upper case 
data["Journal title"] = data["Journal title"].str.upper()
data["Journal title"].value_counts()[:20]

PLOS ONE                                           190
JOURNAL OF BIOLOGICAL CHEMISTRY                     53
NEUROIMAGE                                          29
PLOS GENETICS                                       24
PLOS PATHOGENS                                      24
NUCLEIC ACIDS RESEARCH                              23
PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCES     20
PLOS NEGLECTED TROPICAL DISEASES                    20
HUMAN MOLECULAR GENETICS                            19
NATURE COMMUNICATIONS                               19
BRAIN                                               14
BMC PUBLIC HEALTH                                   14
MOVEMENT DISORDERS                                  13
DEVELOPMENTAL CELL                                  12
BIOCHEMICAL JOURNAL                                 12
JOURNAL OF NEUROSCIENCE                             12
CURRENT BIOLOGY                                     11
JOURNAL OF GENERAL VIROLOGY                         11
BMJ       

In [28]:
data["Journal title"] = data["Journal title"].str.strip()
data["Journal title"].value_counts()[:20]

PLOS ONE                                           190
JOURNAL OF BIOLOGICAL CHEMISTRY                     53
NEUROIMAGE                                          29
NUCLEIC ACIDS RESEARCH                              26
PLOS PATHOGENS                                      24
PLOS GENETICS                                       24
PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCES     22
PLOS NEGLECTED TROPICAL DISEASES                    20
NATURE COMMUNICATIONS                               19
HUMAN MOLECULAR GENETICS                            19
MOVEMENT DISORDERS                                  15
BMC PUBLIC HEALTH                                   14
BRAIN                                               14
JOURNAL OF NEUROSCIENCE                             13
BIOCHEMICAL JOURNAL                                 12
DEVELOPMENTAL CELL                                  12
CURRENT BIOLOGY                                     11
JOURNAL OF GENERAL VIROLOGY                         11
BMJ       

In [31]:
# Fix some misspells and abbreviations

data["Journal title"].replace("PNAS", "Proceedings of the National Academy of Sciences".upper(), inplace=True)
data["Journal title"].replace("P.N.A.S.", "Proceedings of the National Academy of Sciences".upper(), inplace=True)
data["Journal title"].replace("International Journal of Behavioural Nutrition and Physical Activity".upper(), "Journal of Behavior Therapy and Experimental Psychiatry".upper(), inplace=True)

data["Journal title"].value_counts()[:30]

PLOS ONE                                           190
JOURNAL OF BIOLOGICAL CHEMISTRY                     53
NEUROIMAGE                                          29
PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCES     28
NUCLEIC ACIDS RESEARCH                              26
PLOS PATHOGENS                                      24
PLOS GENETICS                                       24
PLOS NEGLECTED TROPICAL DISEASES                    20
HUMAN MOLECULAR GENETICS                            19
NATURE COMMUNICATIONS                               19
MOVEMENT DISORDERS                                  15
BMC PUBLIC HEALTH                                   14
BRAIN                                               14
JOURNAL OF NEUROSCIENCE                             13
BIOCHEMICAL JOURNAL                                 12
DEVELOPMENTAL CELL                                  12
CURRENT BIOLOGY                                     11
JOURNAL OF GENERAL VIROLOGY                         11
MALARIA JO

In [29]:
journal_list = data["Journal title"].value_counts().index.tolist()

In [34]:
unclean_top30 = data["Journal title"].value_counts()[:30]
unclean_top30

PLOS ONE                                           190
JOURNAL OF BIOLOGICAL CHEMISTRY                     53
NEUROIMAGE                                          29
PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCES     28
NUCLEIC ACIDS RESEARCH                              26
PLOS PATHOGENS                                      24
PLOS GENETICS                                       24
PLOS NEGLECTED TROPICAL DISEASES                    20
HUMAN MOLECULAR GENETICS                            19
NATURE COMMUNICATIONS                               19
MOVEMENT DISORDERS                                  15
BMC PUBLIC HEALTH                                   14
BRAIN                                               14
JOURNAL OF NEUROSCIENCE                             13
BIOCHEMICAL JOURNAL                                 12
DEVELOPMENTAL CELL                                  12
CURRENT BIOLOGY                                     11
JOURNAL OF GENERAL VIROLOGY                         11
MALARIA JO

In [55]:
journal_list = sorted(journal_list)
len(journal_list)

894

In [58]:
data.index.size

data['Journal title'] = data['Journal title'].replace(to_replace=r'J.*BIO.*C*', value='JOURNAL OF BIOLOGICAL CHEMISTRY', regex=True)

# for i in range(data.index.size):
#     if re.match('J.*BIO.*C*', str(data.loc[i, 'Journal title'])):
#         data.loc[i, 'Journal title'] = "JOURNAL OF BIOLOGICAL CHEMISTRY"

for i in range(data.index.size):
    if re.match('NEURO.*MAGE.*', str(data.loc[i, 'Journal title'])):
        data.loc[i, 'Journal title'] = "NEUROIMAGE"
        
for i in range(data.index.size):
    if re.match('PRO.*NATIO.*', str(data.loc[i, 'Journal title'])):
        data.loc[i, 'Journal title'] = "PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCES"
        
for i in range(data.index.size):
    if re.match('^NAR|^N.A.R|NUC.*ACI.*', str(data.loc[i, 'Journal title'])):
        data.loc[i, 'Journal title'] = "NUCLEIC ACIDS RESEARCH"
        
for i in range(data.index.size):
    if re.match('PLO.*\sGEN.*', str(data.loc[i, 'Journal title'])):
        data.loc[i, 'Journal title'] = "PLOS GENETICS"
        
for i in range(data.index.size):
    if re.match('PLO.*\sPAT.*', str(data.loc[i, 'Journal title'])):
        data.loc[i, 'Journal title'] = "PLOS PATHOGENS"
        
for i in range(data.index.size):
    if re.match('^PNTD|^P.N.T.D|PLO.*\sNEG.*\sTRO.*', str(data.loc[i, 'Journal title'])):
        data.loc[i, 'Journal title'] = "PLOS NEGLECTED TROPICAL DISEASE"
        
for i in range(data.index.size):
    if re.match('NAT.*\sCOM.*', str(data.loc[i, 'Journal title'])):
        data.loc[i, 'Journal title'] = "NATURE COMMUNICATIONS"
        
for i in range(data.index.size):
    if re.match('HUM.*\sMOL.', str(data.loc[i, 'Journal title'])):
        data.loc[i, 'Journal title'] = "HUMAN MOLECULAR GENETICS"    
        

            
clean_top30 = data["Journal title"].value_counts()[:30]
clean_top30[:5]



PLOS ONE                                           190
JOURNAL OF BIOLOGICAL CHEMISTRY                     92
NEUROIMAGE                                          36
PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCES     34
NUCLEIC ACIDS RESEARCH                              29
Name: Journal title, dtype: int64

In [67]:
# if (data["cost"].str.match('£*')).any():
#     plos_one["cost"] = plos_one["cost"].str.replace('£', '')
#     plos_one["cost"] = plos_one["cost"].astype(float)
#     plos_one["cost"] = plos_one["cost"] * 1.5

## A-2. calculate the mean, median, and standard deviation of the open-access cost per article for each journal

In [71]:
plos_one = data[data["Journal title"] == "PLOS ONE"]

data["COST (£) charged to Wellcome (inc VAT when charged)"] = data["COST (£) charged to Wellcome (inc VAT when charged)"].str.replace('£', '')

# data[["Journal title", "COST (£) charged to Wellcome (inc VAT when charged)"]].groupby('Journal title').agg(['max', 'min'])     
     

Unnamed: 0_level_0,COST (£) charged to Wellcome (inc VAT when charged),COST (£) charged to Wellcome (inc VAT when charged)
Unnamed: 0_level_1,max,min
Journal title,Unnamed: 1_level_2,Unnamed: 2_level_2
ACADEMY OF NUTRITION AND DIETETICS,2379.54,2379.54
ACS CHEMICAL BIOLOGY,947.07,1267.76
ACS CHEMICAL NEUROSCIENCE,1186.80,1186.80
ACS NANO,693.39,642.89
"ACTA CRYSTALLOGRAPHICA SECTION D, BIOLOGICAL CRYSTALLOGRAPHY",771.42,771.42
...,...,...
VIROLOGY JOURNAL,1242.00,1242.00
VIRUS RESEARCH,1947.09,1947.09
VISION RESEARCH,999999.00,1456.18
VISUAL NEUROSCIENCE,2034.00,2034.00


In [72]:

plos_one["cost"] = plos_one["COST (£) charged to Wellcome (inc VAT when charged)"]
    

# for i in range(190):
#  plos_one["cost"].iloc[i] =  float(plos_one["cost"].iloc[i].replace('£', ''))

# sorted_plos_one = sorted(plos_one["cost"])    
# sorted_plos_one

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [73]:
plos_one.dtypes

PMID/PMCID                                             object
Publisher                                              object
Journal title                                          object
Article title                                          object
COST (£) charged to Wellcome (inc VAT when charged)    object
cost                                                   object
dtype: object

In [74]:
pd.options.display.max_rows = 200
plos_one

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged),cost
1282,3517619,PLoS,PLOS ONE,HCN1 and HCN2 in Rat DRG Neurons: Levels in No...,1001.03,1001.03
1283,3498109,PLoS,PLOS ONE,Fetal alcohol exposure and IQ at age 8: Eviden...,1004.15,1004.15
1284,3515553,PLoS,PLOS ONE,Vitamin B-12 status during pregnancy and child...,1011.45,1011.45
1285,3522679,PLoS,PLOS ONE,Validation of Dual Energy X-ray Absorptiometry...,1011.45,1011.45
1286,3485223,PLoS,PLOS ONE,Associations of different phenotypes of wheezi...,1015.73,1015.73
1287,PMC3547059,PLoS,PLOS ONE,"""Involvement of EphB1 receptors signalling in ...",1023.41,1023.41
1288,3573029,PLoS,PLOS ONE,Reactive oxygen species modulate the barrier f...,1039.87,1039.87
1289,3769269,PLoS,PLOS ONE,Chronic pravastatin but not atorvastatin treat...,1061.24,1061.24
1290,3782430,PLoS,PLOS ONE,Expression of HIV-1 Vpu Leads to Loss of the V...,1061.24,1061.24
1291,3797097,PLoS,PLOS ONE,Molecular phylogeny of a RING E3 ubiquitin lig...,1061.24,1061.24


In [75]:
# plos_one["cost"] = (plos_one["cost"].str.replace('$', ''))
plos_one["cost"]

1282      1001.03
1283      1004.15
1284      1011.45
1285      1011.45
1286      1015.73
1287      1023.41
1288      1039.87
1289      1061.24
1290      1061.24
1291      1061.24
1292      1080.00
1293       794.93
1294       809.29
1295       819.34
1296       854.96
1297       901.50
1298       902.12
1299       903.89
1303      1061.93
1304       389.73
1305       443.38
1414      1002.86
1415      1005.86
1416      1005.96
1417      1008.97
1418      1009.98
1419      1010.80
1420      1013.03
1421      1019.76
1422      1027.73
1423      1034.95
1424      1036.38
1425      1036.78
1426      1040.21
1427      1040.70
1428      1040.75
1429      1040.87
1430      1041.66
1431      1044.35
1432      1044.55
1433      1044.73
1434      1044.74
1435      1046.74
1436      1047.02
1437      1047.25
1438      1047.25
1439      1048.79
1440      1056.24
1441      1057.92
1442      1060.00
1443      1060.06
1444      1063.55
1445      1063.69
1446      1063.69
1447      1064.04
1448      

In [19]:
plos_one["cost"].mode()

0    825.68
dtype: object

In [76]:
for i in range(190):
    if plos_one["cost"].iloc[i] == 192645.0:
        plos_one["cost"].iloc[i] = 1926.45
    elif plos_one["cost"].iloc[i] == 999999.0:
        plos_one["cost"].iloc[i] = 825.68

sorted(plos_one["cost"])

['1001.03',
 '1002.86',
 '1004.15',
 '1005.86',
 '1005.96',
 '1008.97',
 '1009.98',
 '1010.80',
 '1011.45',
 '1011.45',
 '1013.03',
 '1015.73',
 '1019.76',
 '1023.41',
 '1027.73',
 '1034.95',
 '1036.38',
 '1036.78',
 '1039.87',
 '1040.21',
 '1040.70',
 '1040.75',
 '1040.87',
 '1041.66',
 '1044.35',
 '1044.55',
 '1044.73',
 '1044.74',
 '1046.74',
 '1047.02',
 '1047.25',
 '1047.25',
 '1048.79',
 '1056.24',
 '1057.92',
 '1060.00',
 '1060.06',
 '1061.24',
 '1061.24',
 '1061.24',
 '1061.93',
 '1063.55',
 '1063.69',
 '1063.69',
 '1064.04',
 '1064.04',
 '1064.22',
 '1065.58',
 '1068.11',
 '1068.12',
 '1068.12',
 '1068.19',
 '1068.23',
 '1072.85',
 '1072.92',
 '1075.98',
 '1076.94',
 '1079.64',
 '1080.00',
 '1085.61',
 '122.31',
 '1331.13',
 '1395.78',
 '1541.48',
 '1692.00',
 '1745.00',
 '1775.50',
 '1785.36',
 '192645.00',
 '214.74',
 '329.79',
 '389.73',
 '424.95',
 '443.38',
 '534.26',
 '741.35',
 '747.05',
 '773.45',
 '794.93',
 '809.29',
 '819.34',
 '825.68',
 '825.68',
 '825.68',
 '825.

In [77]:
print("Mean", plos_one["cost"].mean())
print("Median", plos_one["cost"].median())
print("Std", plos_one["cost"].std())

TypeError: Could not convert 1001.031004.151011.451011.451015.731023.411039.871061.241061.241061.241080.00794.93809.29819.34854.96901.50902.12903.891061.93389.73443.381002.861005.861005.961008.971009.981010.801013.031019.761027.731034.951036.381036.781040.211040.701040.751040.871041.661044.351044.551044.731044.741046.741047.021047.251047.251048.791056.241057.921060.001060.061063.551063.691063.691064.041064.041064.221065.581068.111068.121068.121068.191068.231072.851072.921075.981076.941079.641085.611331.131395.781541.481692.001745.001775.501785.36122.31192645.00214.74329.79424.95534.26741.35747.05773.45825.68825.68825.68825.68825.68825.68825.68825.68825.68825.68825.68825.68825.68825.68825.68825.68825.68825.68825.68830.05830.97836.02836.43836.43841.65841.65842.17843.72843.78844.52846.32847.73850.00850.50852.43852.53852.95854.43855.40855.40855.40858.15860.51861.35861.35861.35863.78867.30867.72870.13870.13870.13871.45874.61874.61877.09877.09878.44878.72879.33879.33884.32886.18886.23886.23886.30886.76886.95886.95887.31889.97891.74892.12893.43894.71896.32896.96896.99897.19897.61897.61899.34999999.00999999.00999999.00999999.00999999.00999999.00999999.00900.14904.34907.08908.01909.99910.20913.36915.53918.62953.48960.16961.68961.68961.68986.72996.06 to numeric