In [31]:
# compaceCSV.py
# Python 2.7.6

"""
Compare 2 CSV files and highlight the differences using Pandas
"""


import pandas as pd
import os

csv_folder = "..\PROJECTS\compare_csv"

csv1 = "csv1.csv"
csv2 = "csv2.csv"

# Load dataframe from csv and append custom columns
df1 = pd.read_csv(os.path.join(csv_folder, csv1), header = None, names = ["col1", "col2", "col3"])
df2 = pd.read_csv(os.path.join(csv_folder, csv2), header = None, names = ["col1", "col2", "col3"])

print("df1 - {}; df2 - {}".format(len(df1), len(df2)))

df1 - 3; df2 - 4


In [32]:
df2.head()

Unnamed: 0,col1,col2,col3
0,1,2,3
1,11,22,33
2,44,55,66
3,77,88,99


In [33]:
# Add a 4th column as a concatenation of values from all other column
df1["col4"] = df1.col1 + df1.col2 + df1.col3
df1.head()

Unnamed: 0,col1,col2,col3,col4
0,1,2,3,6
1,11,22,33,66
2,111,222,333,666


In [34]:
# Add a 4th column as a concatenation of values from all other column
df2["col4"] = df2.col1 + df2.col2 + df2.col3
df2.head()

Unnamed: 0,col1,col2,col3,col4
0,1,2,3,6
1,11,22,33,66
2,44,55,66,165
3,77,88,99,264


In [35]:
df1 = df1.sort_values(by = ["col4"], ascending = [True])
df2 = df2.sort_values(by = ["col4"], ascending = [True])

In [36]:
df1.head()

Unnamed: 0,col1,col2,col3,col4
0,1,2,3,6
1,11,22,33,66
2,111,222,333,666


In [37]:
# Left outer join rows from both dataframes
left_outer_df = pd.merge(df1, df2, how = "left", on=["col4"])
left_outer_df.head()

Unnamed: 0,col1_x,col2_x,col3_x,col4,col1_y,col2_y,col3_y
0,1,2,3,6,1.0,2.0,3.0
1,11,22,33,66,11.0,22.0,33.0
2,111,222,333,666,,,


In [38]:
# Common rows from both dataframes
common_df = pd.merge(df1, df2, how = "inner", on=["col4"])
common_df.head()

Unnamed: 0,col1_x,col2_x,col3_x,col4,col1_y,col2_y,col3_y
0,1,2,3,6,1,2,3
1,11,22,33,66,11,22,33


In [39]:
# Find differnece between 2 dfs
diff_df = df1[(~df1.col4.isin(common_df.col4))]

# Find differnece between 2 dfs
diff_df1 = df1[(~df1.col4.isin(df2.col4))]

len(diff_df), len(diff_df1)

(1, 1)

In [40]:
print("df1 - {}; df2 - {}".format(len(df1), len(df2)))
print("Left outer join - {}".format(len(left_outer_df)))
print("Common - {}".format(len(common_df)))
print("Diff - {}".format(len(diff_df)))

df1 - 3; df2 - 4
Left outer join - 3
Common - 2
Diff - 1


In [41]:
left_outer_df = left_outer_df.sort_index()  # Sort by index
common_df = common_df.sort_index()  # Sort by index
diff_df = diff_df.sort_index()  # Sort by index

In [42]:
# Export dataframes to csv files
common_df.to_csv(os.path.join(csv_folder, "common.csv"))
diff_df.to_csv(os.path.join(csv_folder, "diff.csv"))
left_outer_df.to_csv(os.path.join(csv_folder, "left-outer.csv"))

# Alternate method

In [43]:
import pandas as pd
import os

csv_folder = "..\PROJECTS\compare_csv"

csv1 = "csv1.csv"
csv2 = "csv2.csv"

# Load dataframe from csv and append custom columns
df3 = pd.read_csv(os.path.join(csv_folder, csv1), header = 0, names = ["col1", "col2", "col3"])
df4 = pd.read_csv(os.path.join(csv_folder, csv2), header = 0, names = ["col1", "col2", "col3"])

In [44]:
df3.head()

Unnamed: 0,col1,col2,col3
0,11,22,33
1,111,222,333


In [45]:
df4.head()

Unnamed: 0,col1,col2,col3
0,11,22,33
1,44,55,66
2,77,88,99


In [46]:
df4["check"] = 1
df4.head()

Unnamed: 0,col1,col2,col3,check
0,11,22,33,1
1,44,55,66,1
2,77,88,99,1


In [47]:
# Common rows from both dataframes
newdf = pd.merge(df3, df4, how = "left", on=["col1", "col2", "col3"])
newdf.head()

Unnamed: 0,col1,col2,col3,check
0,11,22,33,1.0
1,111,222,333,


In [48]:
newdf.check.value_counts()

1    1
Name: check, dtype: int64

In [49]:
diffdf = newdf[newdf.check != 1]
diffdf

Unnamed: 0,col1,col2,col3,check
1,111,222,333,


In [50]:
# Export dataframes to csv files
diffdf.to_csv(os.path.join(csv_folder, "diff1.csv"))