#####Intro to Pandas
***
Following [this](http://pandas.pydata.org/pandas-docs/stable/10min.html) tutorial.

In [20]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Convenience imports
from IPython.display import display # display(your_stuff)

#####Object Creation
***

In [21]:
# Create a Series
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0     1
1     3
2     5
3   NaN
4     6
5     8
dtype: float64

In [28]:
# Create a DataFrame

# create dates separated into periods - which are just days
dates = pd.date_range('20130101', periods=6)
display(dates)

# create an arary: np.random.randn(6, 4) -> array(y, x) of random numbers
display(np.random.randn(6, 4))

# pd.DataFrame(row_values, index=row_names, columns=column_names)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
display(df)

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D', tz=None)

array([[ 1.01461626,  0.90713026, -1.31882102, -1.28262518],
       [-0.29282161, -0.60469734,  0.29277958,  1.83001052],
       [-0.96803134, -2.51369681, -1.48004624,  0.23880916],
       [-0.94046969, -0.43920537,  1.23091556, -0.34015811],
       [ 1.2542869 ,  0.30401143,  1.11417919,  1.51802581],
       [-1.79681368, -0.8144957 , -2.932213  , -0.04210498]])

Unnamed: 0,A,B,C,D
2013-01-01,1.643279,-0.05374,0.041391,0.632955
2013-01-02,-0.35214,-1.244643,-0.258014,0.588093
2013-01-03,-0.319649,-1.092092,-1.076724,-0.462859
2013-01-04,0.324585,1.308598,0.026085,-0.950834
2013-01-05,1.412181,0.411211,1.487764,1.005027
2013-01-06,-1.779313,0.501028,0.177729,-0.265334


In [44]:
# create dataframe from a dictionary - {col_name: values}
#     >> np.array([[1, 2], [3, 4]], dtype="int32"): 
#        array([[3, 3, 3, 3],
#               [3, 3, 3, 3]], dtype=int32)
df2 = pd.DataFrame({ 'A' : 1.,
                     'B' : pd.Timestamp('20130102'),
                     'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                     'D' : np.array([3] * 4,dtype='int32'),
                     'E' : pd.Categorical(["test","train","test","train"]),
                     'F' : 'foo' })

print ("Dictionary representation:")
display(df2.to_dict())

print ("Data frame representation:")
display(df2)

print ("Check all data types:")
display(df2.dtypes)

Dictionary representation:


{'A': {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0},
 'B': {0: Timestamp('2013-01-02 00:00:00'),
  1: Timestamp('2013-01-02 00:00:00'),
  2: Timestamp('2013-01-02 00:00:00'),
  3: Timestamp('2013-01-02 00:00:00')},
 'C': {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0},
 'D': {0: 3, 1: 3, 2: 3, 3: 3},
 'E': {0: 'test', 1: 'train', 2: 'test', 3: 'train'},
 'F': {0: 'foo', 1: 'foo', 2: 'foo', 3: 'foo'}}

Data frame representation:


Unnamed: 0,A,B,C,D,E,F
0,1,2013-01-02,1,3,test,foo
1,1,2013-01-02,1,3,train,foo
2,1,2013-01-02,1,3,test,foo
3,1,2013-01-02,1,3,train,foo


Check all data types:


A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

#####Viewing Data
***

In [52]:
# see the top & bottom rows of the frame
print "Display all data frame:"
display(df)

print "Display top 2 rows:"
display(df.head(2))

print "Display bottom 3 rows:"
display(df.tail(3))

Display all data frame:


Unnamed: 0,A,B,C,D
2013-01-01,1.643279,-0.05374,0.041391,0.632955
2013-01-02,-0.35214,-1.244643,-0.258014,0.588093
2013-01-03,-0.319649,-1.092092,-1.076724,-0.462859
2013-01-04,0.324585,1.308598,0.026085,-0.950834
2013-01-05,1.412181,0.411211,1.487764,1.005027
2013-01-06,-1.779313,0.501028,0.177729,-0.265334


Display top 2 rows:


Unnamed: 0,A,B,C,D
2013-01-01,1.643279,-0.05374,0.041391,0.632955
2013-01-02,-0.35214,-1.244643,-0.258014,0.588093


Display bottom 3 rows:


Unnamed: 0,A,B,C,D
2013-01-04,0.324585,1.308598,0.026085,-0.950834
2013-01-05,1.412181,0.411211,1.487764,1.005027
2013-01-06,-1.779313,0.501028,0.177729,-0.265334


In [66]:
# display index (row_names), columns, and underlying numpy data
display(df.index)

display(df.columns)

display(df.values) 

display(df.describe())

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D', tz=None)

Index([u'A', u'B', u'C', u'D'], dtype='object')

array([[ 1.6432787 , -0.05373985,  0.0413913 ,  0.63295476],
       [-0.35213996, -1.2446432 , -0.25801363,  0.58809293],
       [-0.31964911, -1.092092  , -1.0767237 , -0.46285857],
       [ 0.32458502,  1.30859832,  0.02608493, -0.95083353],
       [ 1.41218055,  0.41121062,  1.48776372,  1.00502672],
       [-1.77931259,  0.50102838,  0.17772897, -0.26533397]])

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.154824,-0.028273,0.066372,0.091175
std,1.268722,0.987162,0.830289,0.760975
min,-1.779313,-1.244643,-1.076724,-0.950834
25%,-0.344017,-0.832504,-0.186989,-0.413477
50%,0.002468,0.178735,0.033738,0.161379
75%,1.140282,0.478574,0.143645,0.621739
max,1.643279,1.308598,1.487764,1.005027


In [72]:
# take actions on the data
print "Transpose (pivot) the data: "
display(df.T)

print "Sort the data: "
display(df)
display(df.sort_index(axis=1, ascending=False))

Transpose (pivot) the data: 


Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,1.643279,-0.35214,-0.319649,0.324585,1.412181,-1.779313
B,-0.05374,-1.244643,-1.092092,1.308598,0.411211,0.501028
C,0.041391,-0.258014,-1.076724,0.026085,1.487764,0.177729
D,0.632955,0.588093,-0.462859,-0.950834,1.005027,-0.265334


Sort the data: 


Unnamed: 0,A,B,C,D
2013-01-01,1.643279,-0.05374,0.041391,0.632955
2013-01-02,-0.35214,-1.244643,-0.258014,0.588093
2013-01-03,-0.319649,-1.092092,-1.076724,-0.462859
2013-01-04,0.324585,1.308598,0.026085,-0.950834
2013-01-05,1.412181,0.411211,1.487764,1.005027
2013-01-06,-1.779313,0.501028,0.177729,-0.265334


Unnamed: 0,D,C,B,A
2013-01-01,0.632955,0.041391,-0.05374,1.643279
2013-01-02,0.588093,-0.258014,-1.244643,-0.35214
2013-01-03,-0.462859,-1.076724,-1.092092,-0.319649
2013-01-04,-0.950834,0.026085,1.308598,0.324585
2013-01-05,1.005027,1.487764,0.411211,1.412181
2013-01-06,-0.265334,0.177729,0.501028,-1.779313


In [114]:
"""
    - Open a file with columns and rows
    - clean rows with particular values
    - reindex the data frame
    - add missing data for missing indexes
    - convert columns to list/dict
"""
import pandas as pd
import csv

data_file = open('/Users/whitehat/Desktop/facs_standard.txt', 'r')
file_name = os.path.basename(data_file.name).split('.')[0]

sniffer = csv.Sniffer()
dialect = sniffer.sniff(data_file.read().replace('\r', '\n'))
print dialect.delimiter
# returns ','

data_file.seek(0)
df = pd.read_csv(data_file, sep=dialect.delimiter)
antigens = df.columns[1:]
current_indexes = [int(line.split(":")[0]) for line in df['Sample'] if line not in ('Mean', 'StdDev')]
full_index = range(1, 97)

# ~df['column_name'].isin(some_values) == # df = df[df.Sample != 'Mean'] and df = df[df.Sample != 'StdDev']
df = df.loc[~df['Sample'].isin(['Mean', 'StdDev'])]
df.index += 1
df.index = current_indexes
df = df.reindex(full_index, fill_value=1)
result = {"{} - {}".format(file_name, antigen): list(df[antigen])  for antigen in antigens}
df

	


Unnamed: 0,Sample,CHOk1,CHO HuPTHR,CHO MSPTHR
1,1: Specimen_001_A1_A01.fcs,5.69,13.70,14.20
2,2: Specimen_001_A2_A02.fcs,5.21,12.40,26.70
3,3: Specimen_001_A3_A03.fcs,7.52,2988.00,7844.00
4,4: Specimen_001_A4_A04.fcs,6.20,11.90,20.30
5,5: Specimen_001_A5_A05.fcs,7.01,8.61,12.70
6,6: Specimen_001_A6_A06.fcs,6.24,9.19,11.00
7,7: Specimen_001_A7_A07.fcs,10.30,422.00,647.00
8,8: Specimen_001_A8_A08.fcs,7.47,1006.00,2087.00
9,9: Specimen_001_A9_A09.fcs,9.72,1314.00,2537.00
10,10: Specimen_001_A10_A10.fcs,6.83,3403.00,8035.00
