In [1]:
import os

import numpy as np
import pandas as pd

# Base locations for the OpenIntro datasets. Both end with a separator, so in
# each case below a file name can be appended directly,
# e.g. web_path + "stent30.csv".
web_path = 'https://www.openintro.org/data/csv/'  # URL prefix for the openintro.org CSV files

# Local alternative if the data was downloaded into the notebook's working
# directory. os.path.join(..., '') adds the trailing separator that the bare
# working directory lacks, so loc_path + "name.csv" is a valid path.
loc_path = os.path.join(os.getcwd(), '')

# Chapter 1

---
## Case study: using stents to prevent strokes
[video](https://www.openintro.org/go?id=video_stat_stents_to_prevent_strokes&referrer=/book/os/index.php)

### Table 1.1

In [4]:
# Load the 30-day and 365-day stent study outcomes. header=0 together with
# `names` discards each file's own header row and applies descriptive names.
p_30 = pd.read_csv(web_path + "stent30.csv", header=0, names=['group', '0-30 days'])
p_365 = pd.read_csv(web_path + "stent365.csv", header=0, names=['group', '0-365 days'])

# Pair the two outcomes row by row with an index-aligned join. Passing `on`
# together with left_index/right_index to merge() is rejected by current
# pandas (MergeError), so only the 365-day outcome column is joined and the
# 'group' column is taken from p_30.
# NOTE(review): assumes both files list the patients in the same order -- confirm.
one_one = p_30.join(p_365['0-365 days'])

# Number the patients 1..n from the data itself instead of a hard-coded count.
one_one.index = pd.RangeIndex(1, len(one_one) + 1, name='patient')
one_one

Unnamed: 0_level_0,group,0-30 days,0-365 days
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,treatment,stroke,stroke
2,treatment,stroke,stroke
3,treatment,stroke,stroke
4,treatment,stroke,stroke
5,treatment,stroke,stroke
...,...,...,...
447,control,no event,no event
448,control,no event,no event
449,control,no event,no event
450,control,no event,no event


### Table 1.2

In [3]:
def outcome_counts(df, outcome_col):
    """Count patients per outcome (rows) and study group (columns).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'group' column and the outcome column `outcome_col`.
    outcome_col : str
        Name of the column holding the outcome labels.

    Returns
    -------
    pandas.DataFrame
        Counts with outcomes ('stroke', 'no event') as rows and groups
        ('treatment', 'control') as columns, in that explicit order.

    Raises
    ------
    KeyError
        If one of the expected group or outcome labels is absent.
    """
    counts = pd.crosstab(df['group'], df[outcome_col])
    # Explicit label order replaces the positional [::-1].T[::-1] reversal,
    # which only worked because the wanted order is reverse-alphabetical.
    return counts.loc[['treatment', 'control'], ['stroke', 'no event']].T


# One count table per follow-up window, keyed by the window label.
pieces = {
    '0-30': outcome_counts(p_30, '0-30 days'),
    '0-365': outcome_counts(p_365, '0-365 days'),
}

# Stack the windows under a (days, outcome) hierarchy, then transpose so the
# groups are rows and the (days, outcome) pairs are columns.
one_two = pd.concat(
    pieces,
    names=['days', 'outcome'],
).T

one_two

days,0-30,0-30,0-365,0-365
outcome,stroke,no event,stroke,no event
group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
treatment,33,191,45,179
control,13,214,28,199


## Data basics
[video](https://www.openintro.org/go?id=video_stat_data_basics&referrer=/book/os/index.php)

### Table 1.3

In [8]:
# Read the email50 sample from the OpenIntro site; the first CSV row
# provides the column names.
one_three = pd.read_csv(f"{web_path}email50.csv", header=0)

# Preview the first five rows only.
one_three.head()

Unnamed: 0,spam,to_multiple,from,cc,sent_email,time,image,attach,dollar,winner,...,viagra,password,num_char,line_breaks,format,re_subj,exclaim_subj,urgent_subj,exclaim_mess,number
0,0,0,1,0,1,2012-01-04 05:19:16,0,0,0,no,...,0,0,21.705,551,1,1,0,0,8,small
1,0,0,1,0,0,2012-02-16 12:10:06,0,0,0,no,...,0,0,7.011,183,1,0,0,0,1,big
2,1,0,1,4,0,2012-01-04 07:36:23,0,2,0,no,...,0,0,0.631,28,0,0,0,0,2,none
3,0,0,1,0,0,2012-01-04 09:49:52,0,0,0,no,...,0,0,2.454,61,0,0,0,0,1,small
4,0,0,1,0,0,2012-01-27 01:34:45,0,0,9,no,...,0,1,41.623,1088,1,0,0,0,43,small


### Table 1.4

#### Variables:

-    **spam** - Indicator for whether the email was spam.
-    **to_multiple** - Indicator for whether the email was addressed to more than one recipient.
-    **from** - Whether the message was listed as from anyone (this is usually set by default for regular outgoing email).
-    **cc** - The number of people CCed on the email.
-    **sent_email** - Indicator for whether the sender had been sent an email in the last 30 days.
-    **time** - Time at which email was sent.
-    **image** - The number of images attached.
-    **attach** - The number of attached files.
-    **dollar** - The number of times a dollar sign or the word "dollar" appeared in the email.
-    **winner** - Indicates whether "winner" appeared in the email.
-    **inherit** - The number of times "inherit" (or an extension, such as "inheritance") appeared in the email.
-    **viagra** - The number of times "viagra" appeared in the email.
-    **password** - The number of times "password" appeared in the email.
-    **num_char** - The number of characters in the email, in thousands.
-    **line_breaks** - The number of line breaks in the email (does not count text wrapping).
-    **format** - Indicates whether the email was written using HTML (e.g. may have included bolding or active links).
-    **re_subj** - Whether the subject started with "Re:", "RE:", "re:", or "rE:".
-    **exclaim_subj** - Whether there was an exclamation point in the subject.
-    **urgent_subj** - Whether the word "urgent" was in the email subject.
-    **exclaim_mess** - The number of exclamation points in the email message.
-    **period_mess** - The number of periods in the message.
-    **signoff** - Whether a sign-off of "Cheers", "Regards", or "Best" (also, "Best Regards") was used.
-    **number** - Factor variable saying whether there was no number, a small number (under 1 million), or a big number.

---





In [10]:
# Display the email50 DataFrame (`one_three`, read in the Table 1.3 cell) as this cell's output.
one_three

Unnamed: 0,spam,to_multiple,from,cc,sent_email,time,image,attach,dollar,winner,...,viagra,password,num_char,line_breaks,format,re_subj,exclaim_subj,urgent_subj,exclaim_mess,number
0,0,0,1,0,1,2012-01-04 05:19:16,0,0,0,no,...,0,0,21.705,551,1,1,0,0,8,small
1,0,0,1,0,0,2012-02-16 12:10:06,0,0,0,no,...,0,0,7.011,183,1,0,0,0,1,big
2,1,0,1,4,0,2012-01-04 07:36:23,0,2,0,no,...,0,0,0.631,28,0,0,0,0,2,none
3,0,0,1,0,0,2012-01-04 09:49:52,0,0,0,no,...,0,0,2.454,61,0,0,0,0,1,small
4,0,0,1,0,0,2012-01-27 01:34:45,0,0,9,no,...,0,1,41.623,1088,1,0,0,0,43,small
5,0,0,1,0,0,2012-01-17 09:31:57,0,0,0,no,...,0,0,0.057,5,0,0,0,0,0,small
6,0,0,1,0,0,2012-03-17 21:18:55,0,0,0,no,...,0,0,0.809,17,0,0,0,0,0,small
7,0,0,1,0,1,2012-03-31 06:58:56,0,0,0,no,...,0,0,5.229,88,1,1,0,0,2,small
8,0,0,1,1,1,2012-01-10 17:57:54,0,0,0,no,...,0,0,9.277,242,1,1,1,0,22,small
9,0,0,1,0,0,2012-01-07 11:29:16,0,0,23,no,...,0,0,17.17,578,1,0,0,0,3,small
