# Data Structures in Python

In [1]:
import pandas as pd

In [3]:
# Read the star workbook into a DataFrame (path is relative to the working directory).
star = pd.read_excel('./data/star.xlsx')

In [4]:
# Preview the first five rows to confirm the file loaded as expected.
star.head()

Unnamed: 0,tmathssk,treadssk,classk,totexpk,sex,freelunk,race,schidkn
0,473,447,small.class,7,girl,no,white,63
1,536,450,small.class,21,girl,no,black,20
2,463,439,regular.with.aide,0,boy,yes,black,19
3,559,448,regular,16,boy,no,white,69
4,489,447,small.class,5,boy,yes,white,79


In [6]:
# Read the districts CSV into a DataFrame; like star, it has a schidkn column.
districts = pd.read_csv('./data/districts.csv')

In [7]:
# Preview the first five rows of districts.
districts.head()

Unnamed: 0,schidkn,school_name,county
0,1,Rosalia,New Liberty
1,2,Montgomeryville,Topton
2,3,Davy,Wahpeton
3,4,Steelton,Palestine
4,6,Tolchester,Sattley


In [8]:
# Column names, non-null counts, dtypes, and memory usage for star.
star.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5748 entries, 0 to 5747
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tmathssk  5748 non-null   int64 
 1   treadssk  5748 non-null   int64 
 2   classk    5748 non-null   object
 3   totexpk   5748 non-null   int64 
 4   sex       5748 non-null   object
 5   freelunk  5748 non-null   object
 6   race      5748 non-null   object
 7   schidkn   5748 non-null   int64 
dtypes: int64(4), object(4)
memory usage: 359.4+ KB


In [9]:
# Column names, non-null counts, dtypes, and memory usage for districts.
districts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   schidkn      89 non-null     int64 
 1   school_name  89 non-null     object
 2   county       89 non-null     object
dtypes: int64(1), object(2)
memory usage: 2.2+ KB


In [10]:
# Summary statistics; with no arguments, only the numeric columns are described.
star.describe()

Unnamed: 0,tmathssk,treadssk,totexpk,schidkn
count,5748.0,5748.0,5748.0,5748.0
mean,485.648051,436.742345,9.307411,39.836639
std,47.771531,31.772857,5.7677,22.957552
min,320.0,315.0,0.0,1.0
25%,454.0,414.0,5.0,20.0
50%,484.0,433.0,9.0,39.0
75%,513.0,453.0,13.0,60.0
max,626.0,627.0,27.0,80.0


In [11]:
# include='all' adds the non-numeric columns, summarized as unique/top/freq.
star.describe(include='all')

Unnamed: 0,tmathssk,treadssk,classk,totexpk,sex,freelunk,race,schidkn
count,5748.0,5748.0,5748,5748.0,5748,5748,5748,5748.0
unique,,,3,,2,2,3,
top,,,regular.with.aide,,boy,no,white,
freq,,,2015,,2954,2973,3869,
mean,485.648051,436.742345,,9.307411,,,,39.836639
std,47.771531,31.772857,,5.7677,,,,22.957552
min,320.0,315.0,,0.0,,,,1.0
25%,454.0,414.0,,5.0,,,,20.0
50%,484.0,433.0,,9.0,,,,39.0
75%,513.0,453.0,,13.0,,,,60.0


## Column-wise operations

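Select a single column and check the type of the result: single brackets return a Series, while double brackets (a list of column names) return a DataFrame: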

In [12]:
# Single brackets with one column name return a Series.
math_scores = star['tmathssk']
type(math_scores)

pandas.core.series.Series

In [13]:
# Double brackets (a list of column names) return a DataFrame, even for one column.
math_scores = star[['tmathssk']]
type(math_scores)

pandas.core.frame.DataFrame

Keep only the columns that are needed for the analysis:

In [14]:
# Keep a subset of columns. This rebinds `star`, so the other columns are
# unavailable until the file is read again.
star = star.loc[:, ['tmathssk', 'treadssk', 'classk', 'totexpk', 'schidkn']]
star.columns

Index(['tmathssk', 'treadssk', 'classk', 'totexpk', 'schidkn'], dtype='object')

In [16]:
# Drop a single column by name; `columns=` is equivalent to passing axis=1.
star = star.drop(columns='schidkn')
star.columns

Index(['tmathssk', 'treadssk', 'classk', 'totexpk'], dtype='object')

In [17]:
# Element-wise sum of the two score columns, stored as a new column.
star['new_column'] = star['tmathssk'].add(star['treadssk'])
star.head()

Unnamed: 0,tmathssk,treadssk,classk,totexpk,new_column
0,473,447,small.class,7,920
1,536,450,small.class,21,986
2,463,439,regular.with.aide,0,902
3,559,448,regular,16,1007
4,489,447,small.class,5,936


In [18]:
# rename takes an {old_name: new_name} mapping and returns a new DataFrame.
star = star.rename(columns={'new_column': 'ttl_score'})
star.columns

Index(['tmathssk', 'treadssk', 'classk', 'totexpk', 'ttl_score'], dtype='object')

## Row-wise operations

Sorting:

In [19]:
# Sort by classk first, then by tmathssk within each class; both ascending by default.
star.sort_values(by=['classk', 'tmathssk']).head()

Unnamed: 0,tmathssk,treadssk,classk,totexpk,ttl_score
309,320,360,regular,6,680
1470,320,315,regular,3,635
2326,339,388,regular,6,727
2820,354,398,regular,6,752
4925,354,391,regular,8,745


In [20]:
# `ascending` takes one flag per sort key: classk ascending, tmathssk descending.
star.sort_values(by=['classk', 'tmathssk'], ascending=[True, False]).head()

Unnamed: 0,tmathssk,treadssk,classk,totexpk,ttl_score
724,626,474,regular,15,1100
1466,626,554,regular,11,1180
1634,626,580,regular,15,1206
2476,626,538,regular,20,1164
2495,626,522,regular,7,1148


Filtering:

In [21]:
# Boolean mask: True for rows whose classk is 'small.class'.
small_class = star['classk'].eq('small.class')
small_class.head()

0     True
1     True
2    False
3    False
4     True
Name: classk, dtype: bool

In [22]:
# Indexing with the boolean mask keeps only the rows where it is True.
star_filtered = star.loc[small_class]
star_filtered.shape

(1733, 5)

In [23]:
# For comparison: the unfiltered frame still has all of its rows.
star.shape

(5748, 5)

In [24]:
# Keep rows with a reading score (treadssk) of at least 500.
star_filtered = star.loc[star['treadssk'].ge(500)]
star_filtered.shape

(233, 5)

In [25]:
# Build each condition as its own mask, then combine them with & (element-wise AND).
high_reading = star['treadssk'] >= 500
in_small_class = star['classk'] == 'small.class'
star_filtered = star[high_reading & in_small_class]
star_filtered.shape

(84, 5)

## Aggregating and joining

In [26]:
# groupby returns a DataFrameGroupBy object rather than a table; an aggregation
# method must be called on it to get results.
star_grouped = star.groupby('classk')
star_grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000181E40BB8D0>

In [27]:
# Mean math score per class type; the double brackets keep the result a DataFrame.
star_grouped[['tmathssk']].mean()

Unnamed: 0_level_0,tmathssk
classk,Unnamed: 1_level_1
regular,483.261
regular.with.aide,483.009926
small.class,491.470283


In [28]:
# Highest total score at each value of totexpk (first five groups shown).
star.groupby('totexpk')[['ttl_score']].max().head()

Unnamed: 0_level_0,ttl_score
totexpk,Unnamed: 1_level_1
0,1171
1,1133
2,1091
3,1203
4,1229


In [30]:
# Re-read both files: `star` was rebound earlier to a column subset without
# schidkn, which is the column star and districts share for the merge.
star = pd.read_excel('data/star.xlsx')
districts = pd.read_csv('data/districts.csv')

In [31]:
# Left join: keep every row of star and attach the matching school_name and county.
# The key is named explicitly because, without `on`, merge joins on every column
# name the two frames share, so an unrelated same-named column would silently
# change the result.
star.merge(districts, how='left', on='schidkn').head()

Unnamed: 0,tmathssk,treadssk,classk,totexpk,sex,freelunk,race,schidkn,school_name,county
0,473,447,small.class,7,girl,no,white,63,Ridgeville,New Liberty
1,536,450,small.class,21,girl,no,black,20,South Heights,Selmont
2,463,439,regular.with.aide,0,boy,yes,black,19,Bunnlevel,Sattley
3,559,448,regular,16,boy,no,white,69,Hokah,Gallipolis
4,489,447,small.class,5,boy,yes,white,79,Lake Mathews,Sugar Mountain


## Reshaping data

In [32]:
# Wide -> long: each original row yields one row per test column. Despite its
# name, star_pivot holds the melted (long) frame.
star_pivot = star.melt(
    id_vars='schidkn',
    value_vars=['tmathssk', 'treadssk'],
    var_name='test_type',
    value_name='score',
)

In [34]:
# Preview the long-format result: one score per row, labelled by test_type.
star_pivot.head()

Unnamed: 0,schidkn,test_type,score
0,63,tmathssk,473
1,20,tmathssk,536
2,19,tmathssk,463
3,69,tmathssk,559
4,79,tmathssk,489


Rename the values in the test_type column:

In [35]:
mapping = {'tmathssk': 'math', 'treadssk': 'reading'}
# replace() leaves values that are not keys of `mapping` untouched, so this cell
# is safe to re-run; map() would turn the already-renamed labels into NaN.
star_pivot['test_type'] = star_pivot['test_type'].replace(mapping)

Find unique values in the test_type column:

In [36]:
# Sanity check: only the two renamed labels should appear.
star_pivot['test_type'].unique()

array(['math', 'reading'], dtype=object)

In [37]:
# Long -> wide: mean score per school and test type ('mean' is pivot_table's
# default aggregation, spelled out here); reset_index turns schidkn back into a column.
star_pivot.pivot_table(
    index='schidkn',
    columns='test_type',
    values='score',
    aggfunc='mean',
).reset_index()

test_type,schidkn,math,reading
0,1,492.272727,443.848485
1,2,450.576923,407.153846
2,3,491.452632,441.000000
3,4,467.689655,421.620690
4,5,460.084746,427.593220
...,...,...,...
74,75,504.329268,440.036585
75,76,490.260417,431.666667
76,78,468.457627,417.983051
77,79,490.500000,434.451613
