In [268]:
import numpy as np
import pandas as pd
import scipy as sc
import matplotlib.pyplot as plt
%matplotlib inline

df = pd.read_csv("wellcome.csv", encoding ='cp1252')
df.head()

Unnamed: 0,PMID/PM,Publisher,Journal tit,Article titl,COST (£) charged to Wellcome (inc VAT when charged)
0,,CUP,Psycholog,Reduced,£0.00
1,PMC36795,ACS,Biomacro,Structural,£2381.04
2,23043264,ACS,J Med Che,Fumaroyl,£642.56
3,23438330,ACS,J Med Che,Orvinols,£669.64
4,23438216,ACS,J Org Che,Regiosele,£685.88


## I want to remove/drop all rows that have NAN on it.

In [270]:
new_df = df.dropna()
new_df.head()

Unnamed: 0,PMID/PM,Publisher,Journal tit,Article titl,COST (£) charged to Wellcome (inc VAT when charged)
1,PMC36795,ACS,Biomacro,Structural,£2381.04
2,23043264,ACS,J Med Che,Fumaroyl,£642.56
3,23438330,ACS,J Med Che,Orvinols,£669.64
4,23438216,ACS,J Org Che,Regiosele,£685.88
5,PMC35794,ACS,Journal of,Comparat,£2392.20


## I want rename the column PMID/PM to PMID and COST (£) charged to Wellcome (inc VAT when charged) to cost for easier column search.

In [272]:
new_df.rename(columns = {'PMID/PM': 'PMID'}, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


## I want to rename the COST (£) charged to Wellcome (inc VAT when charged), to just Cost for simplicity.

In [273]:
new_df.rename(columns = {'COST (£) charged to Wellcome (inc VAT when charged)': 'cost'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


## Printing out the .head to see a preview of what changes.

In [275]:
new_df.head()

Unnamed: 0,PMID,Publisher,Journal tit,Article titl,cost
1,PMC36795,ACS,Biomacro,Structural,£2381.04
2,23043264,ACS,J Med Che,Fumaroyl,£642.56
3,23438330,ACS,J Med Che,Orvinols,£669.64
4,23438216,ACS,J Org Che,Regiosele,£685.88
5,PMC35794,ACS,Journal of,Comparat,£2392.20


## I only want to search PMC follow by digits only.

In [277]:
new_df[new_df['PMID'].str.contains('([A-Z][0-9])')].head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,PMID,Publisher,Journal tit,Article titl,cost
1,PMC36795,ACS,Biomacro,Structural,£2381.04
5,PMC35794,ACS,Journal of,Comparat,£2392.20
6,PMC37092,ACS,Journal of,Mapping,£2367.95
14,PMC3413,ACS Publi,Biochemi,Monomer,£665.64
15,PMC36943,ACS Publi,Journal of,Synthesis,£1006.72


## Now I'm removing the $ and £ symbol on the Cost Column to later obtain the mean, medium, variance without error.

In [279]:
new_df['cost'].replace("[$,£]", '', regex=True).astype(float).head()

1    2381.04
2     642.56
3     669.64
4     685.88
5    2392.20
Name: cost, dtype: float64

## Determine the five most common journals and the total articles for each.

In [281]:
new_df['Journal tit'].value_counts().head(6)

Journal of    277
PLoS One       91
PLoS ONE       62
Proceedin      39
Molecular      35
American       27
Name: Journal tit, dtype: int64

## calculating the Mean, Median, and SD for PLoS One article.

In [304]:
print("The mean: ")
print(new_df.loc[new_df['Journal tit'] == "PLoS One"]['cost'].replace("[$,£]", '', regex=True).astype(float).mean())

print("The median: ")
print(new_df.loc[new_df['Journal tit'] == "PLoS One"]['cost'].replace("[$,£]", '', regex=True).astype(float).median())

print("The SD: ")
print(new_df.loc[new_df['Journal tit'] == "PLoS One"]['cost'].replace("[$,£]", '', regex=True).astype(float).std())

The mean: 
24995.559450549452
The median: 
896.32
The SD: 
148336.48781680054


## calculating the Mean, Median, and SD for PLoS ONE article.

In [306]:
print("The mean: ")
print(new_df.loc[new_df['Journal tit'] == "PLoS ONE"]['cost'].replace("[$,£]", '', regex=True).astype(float).mean())

print("The median: ")
print(new_df.loc[new_df['Journal tit'] == "PLoS ONE"]['cost'].replace("[$,£]", '', regex=True).astype(float).median())

print("The SD: ")
print(new_df.loc[new_df['Journal tit'] == "PLoS ONE"]['cost'].replace("[$,£]", '', regex=True).astype(float).std())

The mean: 
49248.71725806452
The median: 
890.095
The SD: 
216138.48621974917


## calculating the Mean, Median, and SD for Proceedin.

In [307]:
print("The mean: ")
print(new_df.loc[new_df['Journal tit'] == "Proceedin"]['cost'].replace("[$,£]", '', regex=True).astype(float).mean())

print("The median: ")
print(new_df.loc[new_df['Journal tit'] == "Proceedin"]['cost'].replace("[$,£]", '', regex=True).astype(float).median())

print("The SD: ")
print(new_df.loc[new_df['Journal tit'] == "Proceedin"]['cost'].replace("[$,£]", '', regex=True).astype(float).std())

The mean: 
993.6494871794872
The median: 
792.0
The SD: 
489.82358956512786


## calculating the Mean, Median, and SD for Molecular.

In [308]:
print("The mean: ")
print(new_df.loc[new_df['Journal tit'] == "Molecular"]['cost'].replace("[$,£]", '', regex=True).astype(float).mean())

print("The median: ")
print(new_df.loc[new_df['Journal tit'] == "Molecular"]['cost'].replace("[$,£]", '', regex=True).astype(float).median())

print("The SD: ")
print(new_df.loc[new_df['Journal tit'] == "Molecular"]['cost'].replace("[$,£]", '', regex=True).astype(float).std())

The mean: 
59179.34
The median: 
2236.02
The SD: 
234996.2392846206


## calculating the Mean, Median, and SD for American.

In [309]:
print("The mean: ")
print(new_df.loc[new_df['Journal tit'] == "American"]['cost'].replace("[$,£]", '', regex=True).astype(float).mean())

print("The median: ")
print(new_df.loc[new_df['Journal tit'] == "American"]['cost'].replace("[$,£]", '', regex=True).astype(float).median())

print("The SD: ")
print(new_df.loc[new_df['Journal tit'] == "American"]['cost'].replace("[$,£]", '', regex=True).astype(float).std())

The mean: 
2052.1700000000005
The median: 
2040.0
The SD: 
536.7577610058379
