# Ex3 - Getting and Knowing your Data

This time we are going to pull data directly from the internet.
Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user).

### Step 3. Assign it to a variable called users and use the 'user_id' as index

In [3]:
url = r'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user'

users = pd.read_csv(url,
                    sep='|',
                    index_col='user_id')

In [4]:
users.columns

Index(['age', 'gender', 'occupation', 'zip_code'], dtype='object')

### Step 4. See the first 25 entries

In [5]:
users.head(25)

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213
6,42,M,executive,98101
7,57,M,administrator,91344
8,36,M,administrator,5201
9,29,M,student,1002
10,53,M,lawyer,90703


### Step 5. See the last 10 entries

In [6]:
users.tail(10)

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
934,61,M,engineer,22902
935,42,M,doctor,66221
936,24,M,other,32789
937,48,M,educator,98072
938,38,F,technician,55038
939,26,F,student,33319
940,32,M,administrator,2215
941,20,M,student,97229
942,48,F,librarian,78209
943,22,M,student,77841


### Step 6. What is the number of observations in the dataset?

In [7]:
users.shape[0]

943

### Step 7. What is the number of columns in the dataset?

In [8]:
users.shape[1]

4

### Step 8. Print the name of all the columns.

In [9]:
users.columns

Index(['age', 'gender', 'occupation', 'zip_code'], dtype='object')

### Step 9. How is the dataset indexed?

In [10]:
users.index

Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
       ...
       934, 935, 936, 937, 938, 939, 940, 941, 942, 943],
      dtype='int64', name='user_id', length=943)

### Step 10. What is the data type of each column?

In [11]:
users.info()

<class 'pandas.core.frame.DataFrame'>
Index: 943 entries, 1 to 943
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   age         943 non-null    int64 
 1   gender      943 non-null    object
 2   occupation  943 non-null    object
 3   zip_code    943 non-null    object
dtypes: int64(1), object(3)
memory usage: 36.8+ KB


### Step 11. Print only the occupation column

In [12]:
users.loc[:,'occupation']

user_id
1         technician
2              other
3             writer
4         technician
5              other
           ...      
939          student
940    administrator
941          student
942        librarian
943          student
Name: occupation, Length: 943, dtype: object

### Step 12. How many different occupations are in this dataset?

In [13]:
users.loc[:,'occupation'].nunique()

21

### Step 13. What is the most frequent occupation?

In [14]:
users.groupby('occupation')['occupation'].agg('count').reset_index(name='count').sort_values('count', ascending=False).head(1)


Unnamed: 0,occupation,count
18,student,196


### Step 14. Summarize the DataFrame.

In [15]:
users.describe()

Unnamed: 0,age
count,943.0
mean,34.051962
std,12.19274
min,7.0
25%,25.0
50%,31.0
75%,43.0
max,73.0


### Step 15. Summarize all the columns

In [16]:
users.describe(include='all')


Unnamed: 0,age,gender,occupation,zip_code
count,943.0,943,943,943.0
unique,,2,21,795.0
top,,M,student,55414.0
freq,,670,196,9.0
mean,34.051962,,,
std,12.19274,,,
min,7.0,,,
25%,25.0,,,
50%,31.0,,,
75%,43.0,,,


### Step 16. Summarize only the occupation column

In [17]:
users['occupation'].describe()

count         943
unique         21
top       student
freq          196
Name: occupation, dtype: object

### Step 17. What is the mean age of users?

In [18]:
users['age'].mean()

34.05196182396607

### Step 18. What is the age with least occurrence?

In [19]:
users.groupby('age')['age'].agg('count').reset_index(name='count').sort_values('count', ascending=True).head()

Unnamed: 0,age,count
0,7,1
56,66,1
60,73,1
1,10,1
2,11,1


In [20]:
users['age'].value_counts(ascending=True).head()

age
73    1
10    1
66    1
7     1
11    1
Name: count, dtype: int64

In [21]:
pd.cut(users['age'], 10).value_counts(ascending=True).head()

age
(6.934, 13.6]     8
(66.4, 73.0]      8
(59.8, 66.4]     23
(53.2, 59.8]     36
(40.0, 46.6]     94
Name: count, dtype: int64