In [1]:
import re
import math
import string
import random
import pandas as pd
import numpy as np

First let's create some fake data.

In [2]:
# Build `num_columns` fake column names. Each name is 3 to 9 characters long
# and is drawn from upper-case letters, lower-case letters and digits.
num_columns = 15
alphabet = string.ascii_uppercase + string.ascii_lowercase + string.digits
columns = [
    # the length is drawn first, then one character per position
    ''.join(random.choice(alphabet) for _ in range(random.randint(3, 9)))
    for _ in range(num_columns)
]
columns

['SymxUUBCx',
 'RcKB398',
 'uNQMYeBmQ',
 'lOihuqh9a',
 'pIkq',
 '21rkd6',
 'yLdLOm',
 'CLrQTb',
 'LFThQKIw',
 'sYef',
 '00L3k',
 'OjQ',
 'RcFU',
 'IKu',
 'Uxf']

In [3]:
# Fake data: two integer matrices with 100 rows each (3 and 7 columns wide),
# filled with values drawn from [0, 100). d1 is generated before d2.
d1, d2 = (np.random.randint(0, 100, size=(100, width)) for width in (3, 7))

In [4]:
# Assemble the two frames. df1 uses the first three names and df2 uses
# names 1 through 7, so the frames share columns[1] and columns[2].
df1 = pd.DataFrame(data=d1, columns=columns[:3])
df2 = pd.DataFrame(data=d2, columns=columns[1:8])

# a 0/1 flag column
df2[columns[8]] = np.random.randint(0, 2, (100, 1))

# simulate a messy column: real NaNs, the literal string 'nan',
# an alphanumeric string, and integers
mixed_type_list = (
    [np.nan] * 12
    + ['nan'] * 13
    + ["abc23"] * 50
    + list(np.random.randint(100, size=(25, 1)).ravel())
)
random.shuffle(mixed_type_list)
df2[columns[9]] = mixed_type_list

# simulate a messy numeric column: every value is a string,
# some integer-like ('50', '10') and some float-like ('25.0')
mixed_num_list = ['50'] * 50 + ['25.0'] * 25 + ['10'] * 25
random.shuffle(mixed_num_list)
df2[columns[10]] = mixed_num_list

# the same date written with two different separators
df2[columns[11]] = ['2020-02-01'] * 50 + ['2020/02/01'] * 50

In [5]:
# column names present in both frames
set(df1.columns) & set(df2.columns)

{'RcKB398', 'uNQMYeBmQ'}

In [6]:
# column names present in exactly one of the two frames
set(df1.columns) ^ set(df2.columns)

{'00L3k',
 '21rkd6',
 'CLrQTb',
 'LFThQKIw',
 'OjQ',
 'SymxUUBCx',
 'lOihuqh9a',
 'pIkq',
 'sYef',
 'yLdLOm'}

So the dataframes overlap on 2 columns, and there are 10 columns that appear in only one of the two.

In [7]:
# write each frame to its own CSV file, leaving out the index column
for frame, csv_name in ((df1, 'data1.csv'), (df2, 'data2.csv')):
    frame.to_csv(csv_name, index=False)

Now let's do some merging on the dataset.

In [8]:
df1.shape, df2.shape

((100, 3), (100, 11))

In [9]:
# Stack the two frames row-wise with a fresh 0..n-1 index. A column that is
# missing from one of the frames is filled with NaN for that frame's rows.
df = pd.concat(objs=[df1, df2], ignore_index=True)
df.head()

Unnamed: 0,SymxUUBCx,RcKB398,uNQMYeBmQ,lOihuqh9a,pIkq,21rkd6,yLdLOm,CLrQTb,LFThQKIw,sYef,00L3k,OjQ
0,51.0,2,37,,,,,,,,,
1,83.0,88,38,,,,,,,,,
2,40.0,30,25,,,,,,,,,
3,27.0,52,66,,,,,,,,,
4,65.0,43,57,,,,,,,,,


In [10]:
 df.tail()

Unnamed: 0,SymxUUBCx,RcKB398,uNQMYeBmQ,lOihuqh9a,pIkq,21rkd6,yLdLOm,CLrQTb,LFThQKIw,sYef,00L3k,OjQ
195,,27,65,20.0,37.0,29.0,73.0,70.0,0.0,abc23,10.0,2020/02/01
196,,13,7,89.0,81.0,52.0,86.0,87.0,0.0,34,25.0,2020/02/01
197,,8,92,51.0,37.0,23.0,46.0,86.0,1.0,abc23,10.0,2020/02/01
198,,82,68,92.0,68.0,43.0,76.0,61.0,1.0,abc23,25.0,2020/02/01
199,,30,57,49.0,34.0,83.0,26.0,43.0,0.0,,50.0,2020/02/01


In [11]:
df.shape

(200, 12)

You can see that we created a single dataframe with 200 rows, with NaNs filled in wherever a source frame did not have a column. This merging step is simple here, but it can be more complicated in reality. Combining datasets row-wise can create large numbers of these NaN values and overwhelm your RAM.

Then let's do processing/transforms.

In [12]:
# pick the five columns to transform, by their position in the merged frame
col1, col2, col3, col4, col5 = (df.columns[pos] for pos in (1, 8, 9, 10, 11))
(col1, col2, col3, col4, col5)

('RcKB398', 'LFThQKIw', 'sYef', '00L3k', 'OjQ')

In [13]:
float('nan')

nan

In [15]:
# this is a simple transform
# NOTE: df is modified in place, so re-running this cell multiplies the column by 5 again
df[col1] = df[col1]*5

# this one converts to T/F; missing values compare unequal to 1, so they become False
df[col2] = df[col2] == 1

# col 3 converts to numbers, only casting to numeric
# if we can bring the entire column
row_ct = df.shape[0]
num_ct = pd.to_numeric(df[col3], errors='coerce').count() # coerce makes nan, count drops nan
# same as before, checks all to be numeric with regex (missing values are treated as empty strings)
num_regex = r"^((-)?[0-9]+)(,[0-9]+)*(\.[0-9]+)?$|(^$)"
all_are_nums = df[col3].fillna('').astype(str).str.match(num_regex).all()

if (num_ct == row_ct) or all_are_nums:
    df[col3] = pd.to_numeric(df[col3], errors='coerce')

# this cleans/filters a column of all non numerical contents
# in each element then does the cast.
# The character class keeps only digits and '.'; the previous pattern
# r"[^0-9\\.]" also kept literal backslashes because of the escaped backslash.
# NOTE: a leading '-' is stripped as well, so negative numbers lose their sign.
df[col3] = df[col3].replace(regex=r"[^0-9.]", value="")
df[col3] = pd.to_numeric(df[col3], errors='coerce')

# using a custom function to convert col4 to whole numbers.
# pd.isna is used for the missing-value check because NaN != NaN: a list
# membership test only recognises a NaN that is the very same object as one in the list.
# Because the column still holds missing values, pandas stores the result as float64.
df[col4] = df[col4].apply(lambda m: None if pd.isna(m) else int(float(m)))

# simple date conversion.
# The column mixes '2020-02-01' and '2020/02/01'. Normalise the separator and
# pass an explicit format so that every row is parsed the same way, instead of
# relying on format inference (which can coerce the second spelling to NaT).
df[col5] = pd.to_datetime(
    df[col5].replace(regex=r"/", value="-"),
    format='%Y-%m-%d',
    errors='coerce',
)

In [16]:
df.head()

Unnamed: 0,SymxUUBCx,RcKB398,uNQMYeBmQ,lOihuqh9a,pIkq,21rkd6,yLdLOm,CLrQTb,LFThQKIw,sYef,00L3k,OjQ
0,51.0,10,37,,,,,,False,,,NaT
1,83.0,440,38,,,,,,False,,,NaT
2,40.0,150,25,,,,,,False,,,NaT
3,27.0,260,66,,,,,,False,,,NaT
4,65.0,215,57,,,,,,False,,,NaT


In [17]:
df.tail()

Unnamed: 0,SymxUUBCx,RcKB398,uNQMYeBmQ,lOihuqh9a,pIkq,21rkd6,yLdLOm,CLrQTb,LFThQKIw,sYef,00L3k,OjQ
195,,135,65,20.0,37.0,29.0,73.0,70.0,False,23.0,10.0,2020-02-01
196,,65,7,89.0,81.0,52.0,86.0,87.0,False,34.0,25.0,2020-02-01
197,,40,92,51.0,37.0,23.0,46.0,86.0,True,23.0,10.0,2020-02-01
198,,410,68,92.0,68.0,43.0,76.0,61.0,True,23.0,25.0,2020-02-01
199,,150,57,49.0,34.0,83.0,26.0,43.0,False,,50.0,2020-02-01


In [18]:
df.dtypes

SymxUUBCx           float64
RcKB398               int64
uNQMYeBmQ             int64
lOihuqh9a           float64
pIkq                float64
21rkd6              float64
yLdLOm              float64
CLrQTb              float64
LFThQKIw               bool
sYef                float64
00L3k               float64
OjQ          datetime64[ns]
dtype: object

This example covers the kinds of things you might do while merging and transforming data, including comparisons that depend on the structure of an entire column. Now I'll show you how to port these kinds of operations to Beam.

In [None]:
# save the merged and transformed frame to CSV, leaving out the index column
df.to_csv('data3.csv',index=False)

# This example was simple but your data problems may be complex.

# For any of your complex data problems I'm available to hire on contract to help you build and scale whatever data decision engine you need for your business. 

# Reach out by messaging inquire@automatedinnovations.net