## Process and create pipeline for adult's income dataset

In [1]:
from IPython.display import display
from numpy.random import RandomState
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer, precision_recall_fscore_support, roc_auc_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn_pandas import DataFrameMapper
import numpy as np
import pandas as pd

In [8]:
# Extract dataset
rs = RandomState(30)
url='adult_test.csv'
df = pd.read_csv(url, delimiter=',',header=None)

In [9]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,29,Local-gov,115585,Some-college,10,Never-married,Handlers-cleaners,Not-in-family,White,Male,0,0,50,United-States,<=50K
96,48,Self-emp-not-inc,191277,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1902,60,United-States,>50K
97,37,Private,202683,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,48,United-States,>50K
98,48,Private,171095,Assoc-acdm,12,Divorced,Exec-managerial,Unmarried,White,Female,0,0,40,England,<=50K


In [10]:
# Add column's name into df
df.columns = ["Age", "WorkClass", "fnlwgt", "Education", "EducationNum", 
"MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
"CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"]

In [11]:
df

Unnamed: 0,Age,WorkClass,fnlwgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,29,Local-gov,115585,Some-college,10,Never-married,Handlers-cleaners,Not-in-family,White,Male,0,0,50,United-States,<=50K
96,48,Self-emp-not-inc,191277,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1902,60,United-States,>50K
97,37,Private,202683,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,48,United-States,>50K
98,48,Private,171095,Assoc-acdm,12,Divorced,Exec-managerial,Unmarried,White,Female,0,0,40,England,<=50K


In [13]:
# Take a look at the values of the Income column
print(df.Income.unique())

[' <=50K' ' >50K']


In [14]:
# Define class (<=50K = -1, >50K = 1)
df["Income"] = df["Income"].map({ ' <=50K': -1, ' >50K': 1 })

In [15]:
df

Unnamed: 0,Age,WorkClass,fnlwgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,-1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,-1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,-1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,-1
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,29,Local-gov,115585,Some-college,10,Never-married,Handlers-cleaners,Not-in-family,White,Male,0,0,50,United-States,-1
96,48,Self-emp-not-inc,191277,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1902,60,United-States,1
97,37,Private,202683,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,48,United-States,1
98,48,Private,171095,Assoc-acdm,12,Divorced,Exec-managerial,Unmarried,White,Female,0,0,40,England,-1


In [16]:
# Create label (y)
y = df["Income"].values

In [17]:
# delete Income column
df.drop("Income", axis=1, inplace=True,)

In [18]:
df

Unnamed: 0,Age,WorkClass,fnlwgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,29,Local-gov,115585,Some-college,10,Never-married,Handlers-cleaners,Not-in-family,White,Male,0,0,50,United-States
96,48,Self-emp-not-inc,191277,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1902,60,United-States
97,37,Private,202683,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,48,United-States
98,48,Private,171095,Assoc-acdm,12,Divorced,Exec-managerial,Unmarried,White,Female,0,0,40,England


In [19]:
print(df.CapitalGain.value_counts()) # 91.7% of the CapitalGain consists of 0. Now onto the CapitalLoss column
print(df.CapitalLoss.value_counts()) # 95.3% of the CapitalLoss column consists of 0.

0        94
14084     1
14344     1
5013      1
5178      1
2407      1
2174      1
Name: CapitalGain, dtype: int64
0       95
1902     2
1408     1
1573     1
2042     1
Name: CapitalLoss, dtype: int64


In [20]:
# Convert datatype
df.Age = df.Age.astype(float)
df.fnlwgt = df.fnlwgt.astype(float)
df.EducationNum = df.EducationNum.astype(float)
df.HoursPerWeek = df.HoursPerWeek.astype(float)
df.CapitalGain = df.CapitalGain.astype(float)
df.CapitalLoss = df.CapitalLoss.astype(float)

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Age            100 non-null    float64
 1   WorkClass      100 non-null    object 
 2   fnlwgt         100 non-null    float64
 3   Education      100 non-null    object 
 4   EducationNum   100 non-null    float64
 5   MaritalStatus  100 non-null    object 
 6   Occupation     100 non-null    object 
 7   Relationship   100 non-null    object 
 8   Race           100 non-null    object 
 9   Gender         100 non-null    object 
 10  CapitalGain    100 non-null    float64
 11  CapitalLoss    100 non-null    float64
 12  HoursPerWeek   100 non-null    float64
 13  NativeCountry  100 non-null    object 
dtypes: float64(6), object(8)
memory usage: 7.9+ KB


In [22]:
# Check unique values
print(df.WorkClass.unique())
print(df.Education.unique())
print(df.MaritalStatus.unique())
print(df.Occupation.unique())
print(df.Relationship.unique())
print(df.Race.unique())
print(df.Gender.unique())
print(df.NativeCountry.unique())
print(len(df.WorkClass.unique()))

[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc']
[' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th']
[' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse']
[' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty'
 ' Other-service' ' Sales' ' Craft-repair' ' Transport-moving'
 ' Farming-fishing' ' Machine-op-inspct' ' Tech-support' ' ?'
 ' Protective-serv']
[' Not-in-family' ' Husband' ' Wife' ' Own-child' ' Unmarried'
 ' Other-relative']
[' White' ' Black' ' Asian-Pac-Islander' ' Amer-Indian-Eskimo' ' Other']
[' Male' ' Female']
[' United-States' ' Cuba' ' Jamaica' ' India' ' ?' ' Mexico' ' South'
 ' Puerto-Rico' ' Honduras' ' England']
7


In [23]:
# Convert string features into numerical
df["WorkClass"] = df["WorkClass"].map({' State-gov':1, ' Self-emp-not-inc':2, ' Private':3, ' Federal-gov':4, ' Local-gov':5,
 ' ?':0, ' Self-emp-inc':6}) 

In [25]:
df.head()

Unnamed: 0,Age,WorkClass,fnlwgt,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry
0,39.0,1,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States
1,50.0,2,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States
2,38.0,3,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States
3,53.0,3,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States
4,28.0,3,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba


In [26]:
# Label encode
df = pd.get_dummies(df, columns=["Education", "MaritalStatus", "Occupation", "Relationship",
                                 "Race", "Gender", "NativeCountry",])

In [27]:
df.head()

Unnamed: 0,Age,WorkClass,fnlwgt,EducationNum,CapitalGain,CapitalLoss,HoursPerWeek,Education_ 10th,Education_ 11th,Education_ 5th-6th,...,NativeCountry_ ?,NativeCountry_ Cuba,NativeCountry_ England,NativeCountry_ Honduras,NativeCountry_ India,NativeCountry_ Jamaica,NativeCountry_ Mexico,NativeCountry_ Puerto-Rico,NativeCountry_ South,NativeCountry_ United-States
0,39.0,1,77516.0,13.0,2174.0,0.0,40.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,50.0,2,83311.0,13.0,0.0,0.0,13.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,38.0,3,215646.0,9.0,0.0,0.0,40.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,53.0,3,234721.0,7.0,0.0,0.0,40.0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,28.0,3,338409.0,13.0,0.0,0.0,40.0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [28]:
df.shape

(100, 62)

In [29]:
print(pd.value_counts(pd.Series(y)))

-1    75
 1    25
dtype: int64


In [30]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.25, stratify=y, random_state=rs)

In [31]:
print(X_train.shape)
print(X_test.shape)

(75, 62)
(25, 62)


In [46]:
# Standardize data
standard_scaler_cols = ["Age", "WorkClass","fnlwgt", "EducationNum", "CapitalGain","CapitalLoss", "HoursPerWeek"]
print(standard_scaler_cols)

['Age', 'WorkClass', 'fnlwgt', 'EducationNum', 'CapitalGain', 'CapitalLoss', 'HoursPerWeek']


In [47]:
other_cols = list(set(df.columns) - set(standard_scaler_cols))

In [48]:
other_cols

['Relationship_ Own-child',
 'NativeCountry_ Jamaica',
 'NativeCountry_ Puerto-Rico',
 'NativeCountry_ United-States',
 'MaritalStatus_ Married-spouse-absent',
 'Occupation_ Protective-serv',
 'Relationship_ Wife',
 'Education_ 5th-6th',
 'Education_ Doctorate',
 'Education_ HS-grad',
 'Gender_ Female',
 'Gender_ Male',
 'Occupation_ Sales',
 'Occupation_ Other-service',
 'NativeCountry_ Honduras',
 'Race_ Black',
 'Occupation_ Exec-managerial',
 'Relationship_ Not-in-family',
 'Race_ White',
 'NativeCountry_ Cuba',
 'Education_ 9th',
 'Occupation_ Adm-clerical',
 'Education_ 11th',
 'MaritalStatus_ Married-AF-spouse',
 'Occupation_ Craft-repair',
 'Occupation_ Handlers-cleaners',
 'MaritalStatus_ Never-married',
 'Education_ 10th',
 'Occupation_ ?',
 'NativeCountry_ South',
 'Education_ 7th-8th',
 'Education_ Prof-school',
 'Occupation_ Transport-moving',
 'Relationship_ Husband',
 'Race_ Other',
 'Education_ Assoc-voc',
 'Race_ Asian-Pac-Islander',
 'Occupation_ Tech-support',
 'Educ

In [35]:
mapper = DataFrameMapper([([col,], StandardScaler(),) for col in standard_scaler_cols] + [(col, None,) for col in other_cols])
# print(mapper)

In [36]:
mapper

DataFrameMapper(drop_cols=[],
                features=[(['Age'], StandardScaler()),
                          (['WorkClass'], StandardScaler()),
                          (['fnlwgt'], StandardScaler()),
                          (['EducationNum'], StandardScaler()),
                          (['CapitalGain'], StandardScaler()),
                          (['CapitalLoss'], StandardScaler()),
                          (['HoursPerWeek'], StandardScaler()),
                          ('Relationship_ Own-child', None),
                          ('NativeCountry_ Jamaica', None),
                          ('NativeCountry_...
                          ('Gender_ Female', None), ('Gender_ Male', None),
                          ('Occupation_ Sales', None),
                          ('Occupation_ Other-service', None),
                          ('NativeCountry_ Honduras', None),
                          ('Race_ Black', None),
                          ('Occupation_ Exec-managerial', None),
      

In [37]:
clf = LogisticRegression(max_iter=1000,random_state=rs)
pipeline = Pipeline([("scale", mapper,),("logit", clf,)])

strat_kfold = StratifiedKFold(5, random_state=rs,shuffle=True)

estimator = GridSearchCV(pipeline, param_grid={"logit__C": np.power(10, np.arange(-4.0, 5.0)),
        "logit__class_weight": ["balanced", None,],}, scoring=make_scorer(roc_auc_score), cv=strat_kfold,)

estimator.fit(X_train, y_train)
cv_results_df = pd.DataFrame(estimator.cv_results_)

In [49]:
print(pipeline)

Pipeline(steps=[('scale',
                 DataFrameMapper(drop_cols=[],
                                 features=[(['Age'], StandardScaler()),
                                           (['WorkClass'], StandardScaler()),
                                           (['fnlwgt'], StandardScaler()),
                                           (['EducationNum'], StandardScaler()),
                                           (['CapitalGain'], StandardScaler()),
                                           (['CapitalLoss'], StandardScaler()),
                                           (['HoursPerWeek'], StandardScaler()),
                                           ('Relationship_ Own-child', None),
                                           ('NativeCountry_ Jamaic...
                                           ('Occupation_ Other-service', None),
                                           ('NativeCountry_ Honduras', None),
                                           ('Race_ Black', None),
        

In [51]:
print(strat_kfold)
print(estimator)

StratifiedKFold(n_splits=5, random_state=RandomState(MT19937) at 0x811CAB8,
        shuffle=True)
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=RandomState(MT19937) at 0x811CAB8,
        shuffle=True),
             estimator=Pipeline(steps=[('scale',
                                        DataFrameMapper(drop_cols=[],
                                                        features=[(['Age'],
                                                                   StandardScaler()),
                                                                  (['WorkClass'],
                                                                   StandardScaler()),
                                                                  (['fnlwgt'],
                                                                   StandardScaler()),
                                                                  (['EducationNum'],
                                                                   StandardScaler()),
  

In [44]:
cv_results_df.sort_values(by="rank_test_score").head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_logit__C,param_logit__class_weight,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
10,0.065001,0.030384,0.022952,0.007054,10.0,balanced,"{'logit__C': 10.0, 'logit__class_weight': 'bal...",0.5,0.784091,0.75,0.829545,0.829545,0.738636,0.123022,1
17,0.056249,0.0125,0.018753,0.006249,10000.0,,"{'logit__C': 10000.0, 'logit__class_weight': N...",0.5,0.738636,0.829545,0.784091,0.704545,0.711364,0.113773,2
15,0.069315,0.016304,0.021879,0.007653,1000.0,,"{'logit__C': 1000.0, 'logit__class_weight': None}",0.5,0.738636,0.829545,0.784091,0.704545,0.711364,0.113773,2
14,0.071306,0.017615,0.022572,0.007199,1000.0,balanced,"{'logit__C': 1000.0, 'logit__class_weight': 'b...",0.5,0.738636,0.829545,0.784091,0.704545,0.711364,0.113773,2
16,0.065987,0.018174,0.018754,0.006251,10000.0,balanced,"{'logit__C': 10000.0, 'logit__class_weight': '...",0.5,0.738636,0.829545,0.784091,0.704545,0.711364,0.113773,2


In [45]:
estimator.best_params_

{'logit__C': 10.0, 'logit__class_weight': 'balanced'}

In [39]:
def _build_df_from_confusion_matrix(confusion_matrix, as_fractions=False):
    if as_fractions:
        x = np.array(confusion_matrix)
        x = np.apply_along_axis(
            lambda row: [
                row[0] / (row[0] + row[1]),
                row[1] / (row[0] + row[1])
            ],
            1,
            x
        )
    else:
        x = confusion_matrix
    df = pd.DataFrame(
        x,
        index=["<= 50K", "> 50K"],
        columns=["<= 50K", "> 50K"]
    )
    df.index.names = ["Actual"]
    df.columns.names = ["Predicted"]
    return df

In [41]:
y_train_predicted = estimator.predict(X_train)

print("Training set accuracy score: {}".format(accuracy_score(y_train, y_train_predicted)))
print("Training set AUROC score: {}".format(estimator.score(X_train, y_train)))
print("\nConfusion matrix for training set:")
training_confusion_matrix = confusion_matrix(y_train, y_train_predicted)

display(_build_df_from_confusion_matrix(training_confusion_matrix))

Training set accuracy score: 0.9733333333333334
Training set AUROC score: 0.9821428571428572

Confusion matrix for training set:


Predicted,<= 50K,> 50K
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
<= 50K,54,2
> 50K,0,19


In [42]:
print("Same as above but in fractions:")
display(_build_df_from_confusion_matrix(training_confusion_matrix, as_fractions=True))
print("Precision, recall, f-score:")
print(precision_recall_fscore_support(y_train, y_train_predicted))

# Test score
y_test_predicted = estimator.predict(X_test)
print("Test set accuracy score: {}".format(accuracy_score(y_test, y_test_predicted)))
print("Test set AUROC score: {}".format(estimator.score(X_test, y_test)))
print("\nConfusion matrix for test set:")

test_confusion_matrix = confusion_matrix(y_test, y_test_predicted)
display(_build_df_from_confusion_matrix(test_confusion_matrix))
print("Same as above but in fractions:")

display(_build_df_from_confusion_matrix(test_confusion_matrix, as_fractions=True))

print("Precision, recall, f-score:")
print(precision_recall_fscore_support(y_test, y_test_predicted))


Same as above but in fractions:


Predicted,<= 50K,> 50K
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
<= 50K,0.964286,0.035714
> 50K,0.0,1.0


Precision, recall, f-score:
(array([1.       , 0.9047619]), array([0.96428571, 1.        ]), array([0.98181818, 0.95      ]), array([56, 19], dtype=int32))
Test set accuracy score: 0.88
Test set AUROC score: 0.8640350877192984

Confusion matrix for test set:


Predicted,<= 50K,> 50K
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
<= 50K,17,2
> 50K,1,5


Same as above but in fractions:


Predicted,<= 50K,> 50K
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
<= 50K,0.894737,0.105263
> 50K,0.166667,0.833333


Precision, recall, f-score:
(array([0.94444444, 0.71428571]), array([0.89473684, 0.83333333]), array([0.91891892, 0.76923077]), array([19,  6], dtype=int32))


## Data processing tips using Pandas

### 1. Create dataframe

In [1]:
import pandas as pd
import numpy as np
import gc
import time

rows_count = 50000 # Số bản ghi
columns_count = 50 # Số cột
rand_start = 0
rand_end = 1000

print("Making random DataFrame...")
np_matrix = np.random.randint(rand_start, rand_end, size=(rows_count, columns_count))
df = pd.DataFrame(np_matrix, columns=['column_%d' % i for i in range(columns_count)])
print(df)

Making random DataFrame...
       column_0  column_1  column_2  column_3  column_4  column_5  column_6  \
0           164       803       431       392       166       453       671   
1           896       728       582       610        10       160       633   
2           397       685       425       446       257       981        20   
3            79       690       451       149       950       642       144   
4           944       789       989       926        94       134       577   
...         ...       ...       ...       ...       ...       ...       ...   
49995       477        84       163       330       139       594       890   
49996       573        76       353       731       402       832       766   
49997       860        75       613       433       667       870       707   
49998       995       670       984       611       221       264       536   
49999       412       633       159       863       647       790       164   

       column_7  column_

In [2]:
# Create a dictionary
d = {'col1': [1, 2], 'col2': [3, 4]}
print('Dictionary:\n', d)
# Create a dataframe df1 from d:
df1 = pd.DataFrame(data=d)
print("Dataframe 1:\n", df1)
# Create df2 directly from DataFrame method:
df2 = pd.DataFrame(np.array([[1, 2], [3, 4]]), columns=['col1', 'col2'])
print("Dataframe 2:\n", df2)

# LOAD DATAFRAME FROM CSV:
df = pd.read_csv("advertisement.csv",header = 0)
print("Data loaded: \n", df)

Dictionary:
 {'col1': [1, 2], 'col2': [3, 4]}
Dataframe 1:
    col1  col2
0     1     3
1     2     4
Dataframe 2:
    col1  col2
0     1     2
1     3     4
Data loaded: 
         TV  Radio  Newspaper  Sales
0    230.1   37.8       69.2   22.1
1     44.5   39.3       45.1   10.4
2     17.2   45.9       69.3    9.3
3    151.5   41.3       58.5   18.5
4    180.8   10.8       58.4   12.9
..     ...    ...        ...    ...
195   38.2    3.7       13.8    7.6
196   94.2    4.9        8.1    9.7
197  177.0    9.3        6.4   12.8
198  283.6   42.0       66.2   25.5
199  232.1    8.6        8.7   13.4

[200 rows x 4 columns]


In [4]:
df.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


### 2. Select data

In [11]:
# In 10 dòng cuối cùng của dữ liệu
print(df.tail(10))

# Lấy dữ liệu cột có tên col1 và in ra màn hình
print(df['TV'])

# Lấy 4 dòng đầu tiên của cột col2 và in ra màn hình
print(df['Radio'].head(4))

# Lấy dữ liệu 2 cột col1 và col2 đồng thời
print(df[['TV','Radio']])
print(df[df.columns[1:5]]) # get data from column Radio to Sales

# select three rows and two columns using loc:
df.loc[1:3, ['Radio', 'Newspaper']]
# Lấy dòng có index =  5
print(df.iloc[5])

# Lấy các dòng có index từ 6 đến 10
print(df.iloc[6:11]) # Chú ý là 11 nhé

# First column not a number, it's a string:
df.loc[:,'TV']

        TV  Radio  Newspaper  Sales
190   39.5   41.1        5.8   10.8
191   75.5   10.8        6.0    9.9
192   17.2    4.1       31.6    5.9
193  166.8   42.0        3.6   19.6
194  149.7   35.6        6.0   17.3
195   38.2    3.7       13.8    7.6
196   94.2    4.9        8.1    9.7
197  177.0    9.3        6.4   12.8
198  283.6   42.0       66.2   25.5
199  232.1    8.6        8.7   13.4
0      230.1
1       44.5
2       17.2
3      151.5
4      180.8
       ...  
195     38.2
196     94.2
197    177.0
198    283.6
199    232.1
Name: TV, Length: 200, dtype: float64
0    37.8
1    39.3
2    45.9
3    41.3
Name: Radio, dtype: float64
        TV  Radio
0    230.1   37.8
1     44.5   39.3
2     17.2   45.9
3    151.5   41.3
4    180.8   10.8
..     ...    ...
195   38.2    3.7
196   94.2    4.9
197  177.0    9.3
198  283.6   42.0
199  232.1    8.6

[200 rows x 2 columns]
     Radio  Newspaper  Sales
0     37.8       69.2   22.1
1     39.3       45.1   10.4
2     45.9       69.3    9.3

0      230.1
1       44.5
2       17.2
3      151.5
4      180.8
       ...  
195     38.2
196     94.2
197    177.0
198    283.6
199    232.1
Name: TV, Length: 200, dtype: float64

In [12]:
# CHECK DATA INFORMATION:
print(df['TV'].describe())

# get all info from df
print(df.describe())

count    200.000000
mean     147.042500
std       85.854236
min        0.700000
25%       74.375000
50%      149.750000
75%      218.825000
max      296.400000
Name: TV, dtype: float64
               TV       Radio   Newspaper       Sales
count  200.000000  200.000000  200.000000  200.000000
mean   147.042500   23.264000   30.554000   14.022500
std     85.854236   14.846809   21.778621    5.217457
min      0.700000    0.000000    0.300000    1.600000
25%     74.375000    9.975000   12.750000   10.375000
50%    149.750000   22.900000   25.750000   12.900000
75%    218.825000   36.525000   45.100000   17.400000
max    296.400000   49.600000  114.000000   27.000000


### 3. Filter data

In [14]:
# FILTER DATA <=> SELECT ... from... WHERE ...
print(df[df['TV']==218.4]) # get record where TV values = 218.4

       TV  Radio  Newspaper  Sales
20  218.4   27.7       53.4   18.0


In [15]:
print(df[df['Radio'] > 20.0]) # get record where Radio values > 20.0

        TV  Radio  Newspaper  Sales
0    230.1   37.8       69.2   22.1
1     44.5   39.3       45.1   10.4
2     17.2   45.9       69.3    9.3
3    151.5   41.3       58.5   18.5
5      8.7   48.9       75.0    7.2
..     ...    ...        ...    ...
187  191.1   28.7       18.2   17.3
190   39.5   41.1        5.8   10.8
193  166.8   42.0        3.6   19.6
194  149.7   35.6        6.0   17.3
198  283.6   42.0       66.2   25.5

[111 rows x 4 columns]


### 4. Sort data

In [17]:
# Sorting data increase by 1 column:
df.sort_values(by=['TV'])

Unnamed: 0,TV,Radio,Newspaper,Sales
130,0.7,39.6,8.7,1.6
155,4.1,11.6,5.7,3.2
78,5.4,29.9,9.4,5.3
56,7.3,28.1,41.4,5.5
126,7.8,38.9,50.6,6.6
...,...,...,...,...
98,289.7,42.3,51.2,25.4
35,290.7,4.1,8.5,12.8
30,292.9,28.3,43.2,21.4
42,293.6,27.7,1.8,20.7


In [18]:
# Sorting data increase by 2 column:
df.sort_values(by=['TV','Sales'])

Unnamed: 0,TV,Radio,Newspaper,Sales
130,0.7,39.6,8.7,1.6
155,4.1,11.6,5.7,3.2
78,5.4,29.9,9.4,5.3
56,7.3,28.1,41.4,5.5
126,7.8,38.9,50.6,6.6
...,...,...,...,...
98,289.7,42.3,51.2,25.4
35,290.7,4.1,8.5,12.8
30,292.9,28.3,43.2,21.4
42,293.6,27.7,1.8,20.7


In [19]:
# Xếp dữ liệu theo thứ tự giảm dần theo cột TV
df.sort_values(by='TV', ascending=False) # Tham số ascending=False

Unnamed: 0,TV,Radio,Newspaper,Sales
101,296.4,36.3,100.9,23.8
42,293.6,27.7,1.8,20.7
30,292.9,28.3,43.2,21.4
35,290.7,4.1,8.5,12.8
98,289.7,42.3,51.2,25.4
...,...,...,...,...
126,7.8,38.9,50.6,6.6
56,7.3,28.1,41.4,5.5
78,5.4,29.9,9.4,5.3
155,4.1,11.6,5.7,3.2


### 5. Update data

In [20]:
# Increase values in TV column + 1
df['TV'] = df['TV'] + 1
df['TV'].head(5)

0    231.1
1     45.5
2     18.2
3    152.5
4    181.8
Name: TV, dtype: float64

In [None]:
# Format lại trường dữ liệu thời gian sẽ sử dụng mẫu định dạng như sau '%m/%d/%Y %H:%M:%S'), trong đó m là tháng, d là ngày, Y là năm, H là giờ, M là phút, S là giây 
# df['REPORT_DATE'] = pandas.to_datetime(df['REPORT_DATE'], format='%m/%d/%Y %H:%M:%S')

In [21]:
# Change index:
# Đặt cột TV làm index
df = df.set_index("TV")
df

Unnamed: 0_level_0,Radio,Newspaper,Sales
TV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
231.1,37.8,69.2,22.1
45.5,39.3,45.1,10.4
18.2,45.9,69.3,9.3
152.5,41.3,58.5,18.5
181.8,10.8,58.4,12.9
...,...,...,...
39.2,3.7,13.8,7.6
95.2,4.9,8.1,9.7
178.0,9.3,6.4,12.8
284.6,42.0,66.2,25.5


In [23]:
# # Reset  index. Sau lệnh này hệ thống sẽ tạo ra một cột tăng dần làm index. Cột index cũ sẽ có tên mới là index_col
df = df.reset_index()
df

Unnamed: 0,TV,Radio,Newspaper,Sales
0,231.1,37.8,69.2,22.1
1,45.5,39.3,45.1,10.4
2,18.2,45.9,69.3,9.3
3,152.5,41.3,58.5,18.5
4,181.8,10.8,58.4,12.9
...,...,...,...,...
195,39.2,3.7,13.8,7.6
196,95.2,4.9,8.1,9.7
197,178.0,9.3,6.4,12.8
198,284.6,42.0,66.2,25.5


In [None]:
# DELETE:
# Xóa cột col1
# df = df.drop(['col1'],axis=1) # axis = 1 thể hiện xóa theo cột

# Xóa cột col1 và col2
# df = df.drop(columns=['col1','col2'])

# Xóa dòng có index 0 và 1
# df = df.drop([0,1])

# Xóa dòng có index = 'Ness' và cột col2
# df = df.drop(index='Ness', columns= ['col2'])

# INSERT:
# Tạo một dòng mới
new_row = {'TV':105.0, 'Radio':8.7, 'Newspaper':9.2, 'Sales':9.7}

# Thêm dòng vào dataframe
df = df.append(new_row, ignore_index=True) # Thêm ingore_index để cho biết row thêm vào ko có giá trị index.
print(df)

# Thêm cột total = cột Listners + cột Plays
df['Total'] = df['TV'] + df['Radio'] + df['Newspaper'] + df['Sales']
print(df)
# Thêm một cột Games có giá trị = 1
df['Games'] = 1
print(df)


### 6. Handle missing data

In [None]:
# Drop tất cả các dòng có từ 1 giá trị NaN trở lên
df = df.dropna(axis=0, how='any')

# Drop tất cả các cột có từ 1 giá trị NaN trở lên
df = df.dropna(axis=1, how='any')

# Drop tất cả các dòng mà có từ 3 giá trị NaN
df = df.dropna(axis=0, thresh=3)

# Drop tất cả các dòng mà tất cả các giá trị đều là NaN
df = df.dropna(axis=0, how='all')

# Replace NaN values:
# Thay thế các giá trị Nan bằng giá trị 100
df = df.fillna(100)

# Thay thế các giá trị Nan trên cột có tên col1 bằng giá trị 'A'
df['TV'] = df['TV'].fillna('A')

# Thay thế các giá trị Nan trong df bằng giá trị liền sau đó (ở dòng sau)
df = df.fillna(axis=0, method='ffill')

# Thay thế các giá trị Nan trong df bằng giá trị liền trước đó (ở dòng trên)
df = df.fillna(axis=0, method='bfill')

### 7. Query data

In [26]:
# Thêm cột total = cột Listners + cột Plays
# df['Total'] = df['TV'] + df['Radio'] + df['Newspaper'] + df['Sales']
df

Unnamed: 0,TV,Radio,Newspaper,Sales,Total
0,231.1,37.8,69.2,22.1,360.2
1,45.5,39.3,45.1,10.4,140.3
2,18.2,45.9,69.3,9.3,142.7
3,152.5,41.3,58.5,18.5,270.8
4,181.8,10.8,58.4,12.9,263.9
...,...,...,...,...,...
195,39.2,3.7,13.8,7.6,64.3
196,95.2,4.9,8.1,9.7,117.9
197,178.0,9.3,6.4,12.8,206.5
198,284.6,42.0,66.2,25.5,418.3


In [27]:
# Data grouping:
# Group theo TV và tính tổng số Total với từng nhóm
df.groupby('TV')['Total'].sum()

TV
1.7       51.6
5.1       25.6
6.4       51.0
8.3       83.3
8.8      104.9
         ...  
290.7    409.6
291.7    317.1
293.9    386.8
294.6    344.8
297.4    458.4
Name: Total, Length: 190, dtype: float64

In [28]:
# Group theo Radio và tính trung bình Sales
df.groupby('Radio')['Sales'].mean()

Radio
0.0      8.80
0.3      8.70
0.4      5.30
0.8      9.40
1.3     10.10
        ...  
47.8    16.70
48.9    17.10
49.0    25.05
49.4    19.20
49.6    23.80
Name: Sales, Length: 167, dtype: float64

In [45]:
# Group theo 2 trường TV và Radio
print(df.groupby(by=['TV','Radio']).sum())
# print(df.groupby(by=['TV']).sum())

             Newspaper  Sales  Total
TV    Radio                         
1.7   39.6         8.7    1.6   51.6
5.1   11.6         5.7    3.2   25.6
6.4   29.9         9.4    5.3   51.0
8.3   28.1        41.4    5.5   83.3
8.8   38.9        50.6    6.6  104.9
...                ...    ...    ...
290.7 42.3        51.2   25.4  409.6
291.7 4.1          8.5   12.8  317.1
293.9 28.3        43.2   21.4  386.8
294.6 27.7         1.8   20.7  344.8
297.4 36.3       100.9   23.8  458.4

[200 rows x 3 columns]


In [33]:
# PIVOT TABLE
pv = pd.pivot_table(df,index=["TV","Radio","Newspaper"])

In [34]:
pv

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Sales,Total
TV,Radio,Newspaper,Unnamed: 3_level_1,Unnamed: 4_level_1
1.7,39.6,8.7,1.6,51.6
5.1,11.6,5.7,3.2,25.6
6.4,29.9,9.4,5.3,51.0
8.3,28.1,41.4,5.5,83.3
8.8,38.9,50.6,6.6,104.9
...,...,...,...,...
290.7,42.3,51.2,25.4,409.6
291.7,4.1,8.5,12.8,317.1
293.9,28.3,43.2,21.4,386.8
294.6,27.7,1.8,20.7,344.8
