In [1]:
# Import pandas and NumPy

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the dataset

In [4]:
# Read the contaminated interaction export into a DataFrame (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='social_media_interactions_contaminated.csv')

In [5]:
# Show the loaded frame as the cell output to inspect its columns and a sample of rows.
df

Unnamed: 0,CustomerID,InteractionID,InteractionDate,Platform,InteractionType,Sentiment
0,2dcb9523-356b-40b2-a67b-1f27797de261,e5d15761-d0a7-4329-89e3-79a892c56097,2023-07-11,,Comment,
1,e12c37b3-7d4d-472f-9fd8-0df2cb3001aa,02f9f376-70ae-4fcd-9070-1db977939948,2023-07-06,Twitter,Share,
2,08a911a3-65e6-4f5d-a6a1-ae7ddcbe28a2,a83fa04c-f109-4f24-8ce1-2078154f6a1c,2024-05-24,Instagram,Comment,Neutral
3,efdfdfc9-5dbb-4478-911a-101a390a0285,28a69c4b-a2e4-4c74-a130-1132d7733fdf,2023-11-01,Instagram,Like,Neutral
4,ca1e90f6-0e5f-492e-ab92-252ff540da18,d9d1c6f8-5e15-4738-b52b-13c2982420cc,2023-07-08,Instagram,Like,
...,...,...,...,...,...,...
3195,8d8fe8bd-2009-4a15-be37-83dba3cef447,2212803c-4a00-4263-bae5-aaaff6d6bdb1,2023-08-30,Instagram,Like,Neutral
3196,1756838b-8368-4852-b4a5-32a23643a3dd,2f50ca57-f930-4d3e-ae08-9f3b26c7f148,2024-06-16,Facebook,Like,Neutral
3197,eb8bb3ce-e182-4a9c-8084-c55c738815e1,ee2c3d14-248c-48b0-aa80-4ea97e449bf8,2023-12-22,Instagram,Comment,Negative
3198,a5d08a95-9dba-4e3d-b997-1cfdde6a89ef,e77a8713-fdc7-4f71-85f8-9cd88cca0c42,2023-11-01,Twitter,Share,Positive


In [6]:
# Check for initial duplicate rows

In [7]:
# Flag every row that exactly repeats an earlier row (all columns compared), then count the flags.
duplicate_mask = df.duplicated(keep='first')
initial_duplicates = duplicate_mask.sum()
print(f'Initial duplicate count: {initial_duplicates}')

Initial duplicate count: 180


In [8]:
# Remove duplicate rows

In [9]:
# Keep only the first occurrence of each fully identical row; copy so df_cleaned is independent of df.
df_cleaned = df[~df.duplicated()].copy()

In [10]:
# Verify duplicate count after dropping

In [11]:
# Repeat the duplicate check on the deduplicated frame; a non-zero count would mean the drop did not work.
remaining_mask = df_cleaned.duplicated(keep='first')
duplicates_after = remaining_mask.sum()
print(f'Duplicate count after dropping: {duplicates_after}')

Duplicate count after dropping: 0


In [12]:
# Count null values in each column

In [13]:
# Tally missing values per column (isna is the same check as isnull).
null_counts = df_cleaned.isna().sum()
print('Null values in each column:', null_counts, sep='\n')

Null values in each column:
CustomerID           0
InteractionID        0
InteractionDate      0
Platform           291
InteractionType      0
Sentiment          309
dtype: int64


In [14]:
# Drop all rows with any null values

In [15]:
# Discard every row that has a missing value in at least one column.
df_cleaned = df_cleaned.dropna(axis=0, how='any')

In [16]:
# Verify that there are no more null values

In [17]:
# Recount missing values per column on the filtered frame to confirm none remain.
null_counts_after = df_cleaned.isna().sum()
print('Null values in each column after dropping:', null_counts_after, sep='\n')

Null values in each column after dropping:
CustomerID         0
InteractionID      0
InteractionDate    0
Platform           0
InteractionType    0
Sentiment          0
dtype: int64


In [18]:
# Check data types

In [19]:
# List each column's dtype ahead of the date conversion, under a heading line.
print('Data types before conversion:', df_cleaned.dtypes, sep='\n')

Data types before conversion:
CustomerID         object
InteractionID      object
InteractionDate    object
Platform           object
InteractionType    object
Sentiment          object
dtype: object


In [20]:
# Convert 'InteractionDate' to datetime

In [21]:
# Parse InteractionDate into datetime64. The values shown in the loaded frame are ISO formatted
# (e.g. 2023-07-11), so the pattern must be '%Y-%m-%d'. The earlier '%d/%m/%Y' pattern does not
# match such values, and with errors='coerce' they were silently replaced by NaT.
parsed_dates = pd.to_datetime(df_cleaned['InteractionDate'], format='%Y-%m-%d', errors='coerce')

# errors='coerce' hides parse failures, so report how many values (if any) still ended up as NaT.
unparsed_count = parsed_dates.isna().sum()
if unparsed_count:
    print(f'Warning: {unparsed_count} InteractionDate values could not be parsed and are NaT')

# Build a new frame rather than assigning into a column of a frame derived from another one.
df_cleaned = df_cleaned.assign(InteractionDate=parsed_dates)

In [22]:
# Verify data types after conversion

In [23]:
# This cell runs after the date conversion, so the heading must say "after" (it was copied
# from the pre-conversion cell and still said "before").
print('Data types after conversion:')
print(df_cleaned.dtypes)

Data types before conversion:
CustomerID                 object
InteractionID              object
InteractionDate    datetime64[ns]
Platform                   object
InteractionType            object
Sentiment                  object
dtype: object


In [24]:
# Save the cleaned DataFrame to a new CSV file

In [25]:
# Write the cleaned frame to disk; index=False keeps the row index out of the file.
df_cleaned.to_csv(path_or_buf='social_media_interactions_cleaned.csv', index=False)