# Assignment 2

## Libraries

In [15]:
import pandas as pd
import numpy as np
import networkx as nx

## Functions

### Combine data

In [8]:
def combine_data(dataframes_list: list[pd.DataFrame]) -> pd.DataFrame:
    """Stack several event tables into one, keeping only the shared columns.

    Each input frame is reduced to the columns ``nodeUserID``, ``nodeTime``,
    ``videoID`` and ``platform`` (in that order); any other columns are
    dropped. The reduced frames are then concatenated row-wise with a fresh
    0..n-1 index.

    Args:
        dataframes_list: Frames to combine. Every frame must contain all
            four required columns.

    Returns:
        A new DataFrame with exactly the four required columns. If
        ``dataframes_list`` is empty, an empty DataFrame with those columns
        is returned.

    Raises:
        ValueError: If any frame lacks one or more required columns; the
            message names the offending list index and the missing columns.
    """
    required_columns = ['nodeUserID', 'nodeTime', 'videoID', 'platform']

    processed_dfs = []
    for i, df in enumerate(dataframes_list):
        missing_cols = [col for col in required_columns if col not in df.columns]
        if missing_cols:
            raise ValueError(f"DataFrame at index {i} is missing required columns: {missing_cols}")

        # Selecting with a list of labels already yields a new frame, and
        # concat builds a new result, so the inputs are never modified.
        processed_dfs.append(df[required_columns])

    # pd.concat raises on an empty sequence; return a well-formed empty
    # table instead so callers always get the same schema back.
    if not processed_dfs:
        return pd.DataFrame(columns=required_columns)

    return pd.concat(processed_dfs, ignore_index=True)

### Shuffle time

In [10]:
def shuffle_time(df: pd.DataFrame, random_state: int = None) -> pd.DataFrame:
    """Return a copy of ``df`` whose ``nodeTime`` values are randomly permuted.

    The set of timestamps is preserved; only their assignment to rows is
    shuffled. All other columns and the index are left as they are, and the
    input frame is not modified.

    Args:
        df: Table containing a ``nodeTime`` column.
        random_state: Optional seed. When given, the permutation is
            reproducible and NumPy's global random state is left untouched.
            When ``None``, the permutation is drawn from NumPy's global
            random state.

    Returns:
        A new DataFrame with the shuffled ``nodeTime`` column.
    """
    # Work on a copy so the caller's frame is never mutated.
    df_shuffled = df.copy()

    if random_state is None:
        permute = np.random.permutation
    else:
        # A private legacy RandomState yields the same permutation that
        # np.random.seed(random_state) followed by np.random.permutation
        # would, but without reseeding the process-wide generator.
        permute = np.random.RandomState(random_state).permutation

    df_shuffled['nodeTime'] = permute(df_shuffled['nodeTime'].values)

    return df_shuffled

### Build network

In [16]:
def build_network(df: pd.DataFrame) -> nx.Graph:
    """Build an undirected user–video graph from an event table.

    Node labels are prefixed (``user_<nodeUserID>``, ``video_<videoID>``) so
    that a user ID and a video ID with the same raw value cannot collide.

    Node attributes:
        * users:  ``bipartite=0``, ``node_type='user'``, ``platform``
        * videos: ``bipartite=1``, ``node_type='video'``

    Each row of ``df`` contributes an edge between its user and its video
    carrying ``timestamp=nodeTime``. Because the graph is a simple
    ``nx.Graph``, repeated (user, video) pairs collapse into one edge whose
    timestamp is that of the last such row; likewise a user ID seen on
    several platforms keeps the platform of its last distinct
    (user, platform) pair.

    Args:
        df: Table with ``nodeUserID``, ``nodeTime``, ``videoID`` and
            ``platform`` columns.

    Returns:
        The populated ``nx.Graph``.
    """
    G = nx.Graph()

    # Iterate column-wise rather than with iterrows(): iterrows() coerces
    # every row to one common dtype, which can format the same ID
    # differently in the node pass and the edge pass (e.g. "1.0" vs "1")
    # and is also much slower.
    users = df[['nodeUserID', 'platform']].drop_duplicates()
    for user_id, platform in zip(users['nodeUserID'], users['platform']):
        G.add_node(f"user_{user_id}", bipartite=0, platform=platform, node_type='user')

    for video_id in df['videoID'].unique():
        G.add_node(f"video_{video_id}", bipartite=1, node_type='video')

    for user_id, video_id, node_time in zip(df['nodeUserID'], df['videoID'], df['nodeTime']):
        G.add_edge(f"user_{user_id}", f"video_{video_id}", timestamp=node_time)

    return G

## Loading Data

In [2]:
# Load the Facebook events CSV (path is relative to the working directory)
# and preview its first rows.
dfFacebook = pd.read_csv('data/facebook_cross_platform.csv')
dfFacebook.head()


Unnamed: 0,nodeID,nodeUserID,nodeTime,actionType,videoID,platform
0,548542|1619699981462385,548542,2018-04-01 10:22:08,post,VLMOHhKkrX8,facebook
1,5465518|1679734395397774,5465518,2018-04-05 00:04:00,post,OBkn78q_t_Q,facebook
2,6119363|1782014748487422,6119363,2018-04-05 07:12:02,post,3wj4ncIEDxw,facebook
3,8621215|1596340410441820,8621215,2018-04-05 18:22:26,post,3wj4ncIEDxw,facebook
4,12041680|2070852329860194,12041680,2018-04-08 15:08:50,post,mvD7qhDwljs,facebook


In [3]:
# Load the Reddit events CSV (path is relative to the working directory)
# and preview its first rows.
dfReddit = pd.read_csv('data/reddit_cross_platform.csv')
dfReddit.head()

Unnamed: 0,nodeID,nodeUserID,nodeTime,actionType,videoID,platform
0,dwzhjuz,ThirtyAxes,2018-04-07 21:28:29,comment,8LnVHw96jFs,reddit
1,dwznuk7,ThirtyAxes,2018-04-07 23:28:10,comment,8LnVHw96jFs,reddit
2,dwzvxo1,DiogenesHoSinopeus,2018-04-08 02:01:49,comment,8LnVHw96jFs,reddit
3,dwzwc3c,avivi_,2018-04-08 02:09:43,comment,OI3-0xIcvgI,reddit
4,dx0c36e,barkologix,2018-04-08 08:50:12,comment,R6utDs1b_TU,reddit


In [4]:
# Load the Twitter events CSV (path is relative to the working directory)
# and preview its first rows.
dfTwitter = pd.read_csv("data/twitter_cross_platform.csv")
dfTwitter.head()

Unnamed: 0,nodeID,nodeUserID,nodeTime,actionType,videoID,platform
0,NRFEnex8MF21dytDg9vfAg,ff0vicGZco1mZzYxUCWiGg,2018-04-01 02:13:00,tweet,6h0VDhENotI,twitter
1,Qeyxwl00d0vgLfKh1YqUEw,MCUuXxcepDc2FFRh9i-AtQ,2018-04-01 03:34:41,tweet,NQL3rX6xWRg,twitter
2,1QIQqg23ItMfiKbFFrPkAw,5hBVs8jS6_vCAFsHOjvKcA,2018-04-01 10:23:20,tweet,VLMOHhKkrX8,twitter
3,iYMBH3KdJvOiUHYxeKfx_w,qhQLCsDQNEtptuJJzfEVZA,2018-04-01 11:13:02,tweet,VLMOHhKkrX8,twitter
4,G86DyDFas0zLKkBFW0GWwg,lSmQYvUcuGn6oFyjGJduDw,2018-04-01 13:43:07,tweet,4oQTWn1JfeA,twitter


In [5]:
# Load the YouTube video table and preview its first rows.
# NOTE: this frame is not passed to combine_data in this notebook; the
# preview shows it has no nodeUserID/platform columns.
dfYoutube = pd.read_csv("data/youtube_cross_platform.csv")
dfYoutube.head()

Unnamed: 0,videoID,video_channel,nodeTime
0,Zd6bAEMu5Yk,twotwo30Productions,2006-09-22 21:10:44
1,VU3RHNLzh-I,معاٌ لدعم الدفاع المدني السوري,2014-06-19 10:29:28
2,tdFHNE8WxOA,ZFront Kharkov UA,2014-07-08 14:28:21
3,12irnW4FNFY,ODN,2014-07-12 10:25:05
4,6h0VDhENotI,The Syria Campaign,2014-08-27 12:33:36


In [9]:
# Stack the Facebook, Reddit and Twitter tables into a single frame that
# keeps only nodeUserID, nodeTime, videoID and platform, then preview it.
dfCombined = combine_data(dataframes_list = [dfFacebook, dfReddit, dfTwitter])
dfCombined.head()

Unnamed: 0,nodeUserID,nodeTime,videoID,platform
0,548542,2018-04-01 10:22:08,VLMOHhKkrX8,facebook
1,5465518,2018-04-05 00:04:00,OBkn78q_t_Q,facebook
2,6119363,2018-04-05 07:12:02,3wj4ncIEDxw,facebook
3,8621215,2018-04-05 18:22:26,3wj4ncIEDxw,facebook
4,12041680,2018-04-08 15:08:50,mvD7qhDwljs,facebook


In [13]:
# Randomly permute nodeTime across rows with a fixed seed (42) so the
# result is reproducible. This rebinds dfCombined to the shuffled copy.
dfCombined = shuffle_time(dfCombined, random_state=42)
dfCombined.head()

Unnamed: 0,nodeUserID,nodeTime,videoID,platform
0,548542,2018-04-30 00:42:38,VLMOHhKkrX8,facebook
1,5465518,2018-10-13 20:45:20,OBkn78q_t_Q,facebook
2,6119363,2018-05-09 02:19:31,3wj4ncIEDxw,facebook
3,8621215,2018-04-18 21:12:15,3wj4ncIEDxw,facebook
4,12041680,2018-04-07 23:59:02,mvD7qhDwljs,facebook


In [17]:
# Build the user–video graph from the combined (time-shuffled) table.
G_combined = build_network(dfCombined)

In [19]:
# Export the graph as a GEXF file in the current working directory.
nx.write_gexf(G_combined, 'social_data.gexf')