In [1]:
import csv
import numpy as np
import os
import pandas as pd

In [2]:
# Names of every entry in the current working directory that ends with '.csv'.
csv_list = list(filter(lambda entry: entry.endswith('.csv'), os.listdir(os.curdir)))

In [3]:
def _read_headerless_csv(path, column_names):
    """Read a CSV file that has no header row and label its columns.

    The column labels are assigned after reading, so a mismatch between the
    number of names and the number of columns in the file raises immediately.
    """
    frame = pd.read_csv(path, header=None)
    frame.columns = column_names
    return frame


# One table per input file; the column labels are supplied here because the
# files are read with header=None.
age_train = _read_headerless_csv('age_train.csv', ['uId','age_group'])

app_info = _read_headerless_csv('app_info.csv', ['appId','category'])

user_basic_info = _read_headerless_csv(
    'user_basic_info.csv',
    ['uId','gender','city','prodName',
     'ramCapacity','ramLeftRation','romCapacity',
     'romLeftRation','color','fontSize',
     'ct','carrier','os'],
)

user_app_actived = _read_headerless_csv('user_app_actived.csv', ['uId','appId'])

user_behavior_info = _read_headerless_csv(
    'user_behavior_info.csv',
    ['uId','bootTimes','AFuncTimes','BFuncTimes',
     'CFuncTimes','DFuncTimes','EFuncTimes',
     'FFuncTimes','FFuncSum'],
)

In [56]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() only appended a stray "None" line.
age_train.info(memory_usage='deep')
age_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2010000 entries, 0 to 2009999
Data columns (total 2 columns):
uId          int64
age_group    int64
dtypes: int64(2)
memory usage: 30.7 MB
None


Unnamed: 0,uId,age_group
0,1000001,4
1,1000011,3
2,1000015,5
3,1000019,3
4,1000023,2


In [68]:
def mem_usage(pandas_obj):
    """Return the deep memory footprint of a pandas object as an 'N.NN MB' string.

    Parameters
    ----------
    pandas_obj : pandas.DataFrame or pandas.Series
        Anything that is not a DataFrame is treated as a Series.

    Returns
    -------
    str
        Memory usage in mebibytes (bytes / 1024**2), formatted with two decimals.
    """
    footprint = pandas_obj.memory_usage(deep=True)
    if isinstance(pandas_obj, pd.DataFrame):
        # A DataFrame reports one figure per column (plus the index); add them up.
        footprint = footprint.sum()
    megabytes = footprint / 1024 ** 2  # bytes -> megabytes
    return "{:03.2f} MB".format(megabytes)

def optimize_mem(sll):
    """Downcast the numeric columns of ``sll`` in place to reduce memory usage.

    The memory usage reported by ``mem_usage`` is printed before and after the
    conversion.

    Integer columns are downcast to the smallest unsigned type when they hold
    no negative values, and to the smallest signed type otherwise. Float
    columns are downcast with ``pd.to_numeric(..., downcast='float')``.

    Parameters
    ----------
    sll : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in, with downcast columns.
    """
    print(mem_usage(sll))

    # np.integer matches every integer width. The plain 'int' alias only
    # matches the platform default integer (int32 on Windows with NumPy < 2),
    # which silently skips int64 columns there and int32/uint columns elsewhere.
    int_columns = sll.select_dtypes(include=[np.integer]).columns
    for column in int_columns:
        # downcast='unsigned' leaves a column untouched if it contains any
        # negative value, so fall back to a signed downcast in that case.
        target = 'unsigned' if (sll[column] >= 0).all() else 'integer'
        sll[column] = pd.to_numeric(sll[column], downcast=target)

    float_columns = sll.select_dtypes(include=['float']).columns
    for column in float_columns:
        sll[column] = pd.to_numeric(sll[column], downcast='float')

    print(mem_usage(sll))
    return sll

In [71]:
# optimize_mem modifies the frame it is given and returns that same object,
# so age_train1 and age_train refer to one DataFrame after this call.
age_train1 = optimize_mem(sll=age_train)

9.58 MB
9.58 MB


In [73]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() only appended a stray "None" line.
app_info.info(memory_usage='deep')
app_info.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188864 entries, 0 to 188863
Data columns (total 2 columns):
appId       188864 non-null object
category    188864 non-null object
dtypes: object(2)
memory usage: 28.0 MB
None


Unnamed: 0,appId,category
0,a006,商务
1,a006,实用工具
2,a0015,影音娱乐
3,a0016,实用工具
4,a0019,动作射击


In [None]:
# One-hot encode the 'category' column; the result is only displayed, not stored.
pd.get_dummies(data=app_info, columns=['category'])

In [44]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() only appended a stray "None" line.
user_app_actived.info(memory_usage='deep')
user_app_actived.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2512500 entries, 0 to 2512499
Data columns (total 2 columns):
uId      int64
appId    object
dtypes: int64(1), object(1)
memory usage: 1.1 GB
None


Unnamed: 0,uId,appId
0,1000008,a00105157#a00109386#a00134746#a0013952#a001451...
1,1000010,a00102782#a00107077#a00109386#a00134746#a00136...
2,1000011,a00158535#a00163116#a00170432#a00187480#a00224...
3,1000013,a00109386#a0011601#a0012768#a00136287#a0014516...
4,1000014,a00109386#a0011894#a0013059#a00134840#a0013578...


In [45]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() only appended a stray "None" line.
user_basic_info.info(memory_usage='deep')
user_basic_info.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2512500 entries, 0 to 2512499
Data columns (total 13 columns):
uId              int64
gender           int64
city             object
prodName         object
ramCapacity      float64
ramLeftRation    float64
romCapacity      float64
romLeftRation    float64
color            object
fontSize         float64
ct               object
carrier          object
os               float64
dtypes: float64(6), int64(2), object(5)
memory usage: 975.0 MB
None


Unnamed: 0,uId,gender,city,prodName,ramCapacity,ramLeftRation,romCapacity,romLeftRation,color,fontSize,ct,carrier,os
0,1000001,0,c00145,p00169,3.0,0.43,32.0,0.46,皓月银,1.15,4g,China_Mobile,8.0
1,1000002,1,c0043,p0022,,0.26,,0.59,渐变黑,1.0,wifi,China_Mobile,9.0
2,1000004,1,c00281,p0013,3.0,0.36,32.0,0.09,幻海蓝,1.0,4g#wifi,China_Mobile,8.0
3,1000006,0,c00359,p0045,2.0,0.19,16.0,0.42,灰色,,,China_Mobile,6.0
4,1000007,1,c006,p0097,6.0,0.2,137.0,0.5,宝石蓝,1.0,4g#wifi,China_Telecom,9.0


In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() only appended a stray "None" line.
user_behavior_info.info(memory_usage='deep')
user_behavior_info.head()

In [4]:
# With chunksize set, read_csv returns an iterator over DataFrame chunks rather
# than a DataFrame, so assigning `.columns` on it would not label any chunk.
# Passing `names=` makes every chunk arrive with its columns already named.
user_app_usage = pd.read_csv(
    'user_app_usage.csv',
    header=None,
    names=['uId','appId','duration','times','use_date'],
    chunksize=1024,
)

In [5]:
# Take only the first chunk from the reader for inspection. `each` stays bound
# to that chunk after the loop, and the reader has advanced past it.
for each in user_app_usage:
    each.columns = ['uId','appId','duration','times','use_date']
    # Row count and object type, one per line.
    print(len(each), type(each), sep='\n')
    break


1024
<class 'pandas.core.frame.DataFrame'>


In [6]:
# user_app_usage = pd.concat([chunk for chunk in user_app_usage], ignore_index=True)

In [7]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() only appended a stray "None" line.
each.info(memory_usage='deep')
each

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1024 entries, 0 to 1023
Data columns (total 5 columns):
uId         1024 non-null int64
appId       1024 non-null object
duration    1024 non-null float64
times       1024 non-null float64
use_date    1024 non-null object
dtypes: float64(2), int64(1), object(2)
memory usage: 156.9 KB
None


Unnamed: 0,uId,appId,duration,times,use_date
0,1000364,a00289850,752.0,2.0,2019-03-01
1,1000364,a00287085,836.0,19.0,2019-03-12
2,1000364,a00289826,166.0,23.0,2019-03-01
3,1000364,a00287085,836.0,19.0,2019-03-13
4,1000364,a00278858,123.0,45.0,2019-03-19
5,1000364,a00289850,731.0,2.0,2019-03-03
6,1000364,a00289791,6758.0,157.0,2019-03-20
7,1000364,a00289850,31.0,4.0,2019-03-29
8,1000364,a00157220,10.0,4.0,2019-03-11
9,1000364,a00290027,1259.0,7.0,2019-03-02


In [9]:
# Summarise only the object-dtype columns of the chunk (count / unique / top / freq).
each.describe(include=['O'])

Unnamed: 0,appId,use_date
count,1024,1024
unique,69,30
top,a00289791,2019-03-12
freq,84,43
