In [13]:
from google.cloud import storage
import json
import pandas as pd

# Initialize the Google Cloud Storage client
client = storage.Client()

# Define your bucket and paths
bucket_name = 'msca-bdp-student-gcs'
file_paths = [
    "Group5/Steam_data/steam_data1.json",
    "Group5/Steam_data/steam_data2.json",
    "Group5/Steam_data/steamspy_data1.json",
    "Group5/Steam_data/steamspy_data2.json"
]

# Function to load JSON from GCS
def load_json_from_gcs(bucket_name, file_path):
    bucket = client.get_bucket(bucket_name)
    blob = bucket.blob(file_path)
    json_data = json.loads(blob.download_as_text())
    return json_data

# Load each JSON file
data1 = load_json_from_gcs(bucket_name, file_paths[0])
data2 = load_json_from_gcs(bucket_name, file_paths[1])
data3 = load_json_from_gcs(bucket_name, file_paths[2])
data4 = load_json_from_gcs(bucket_name, file_paths[3])

In [14]:
# Print the first key-value pair if data1 is a dictionary
if isinstance(data1, dict):
    first_key = list(data1.keys())[0]
    print(json.dumps({first_key: data1[first_key]}, indent=2))  # Pretty-print the first key-value pair
else:
    print("data1 is not a dictionary.")


{
  "10": {
    "type": "game",
    "name": "Counter-Strike",
    "steam_appid": 10,
    "required_age": 0,
    "is_free": false,
    "detailed_description": "Play the world's number 1 online action game. Engage in an incredibly realistic brand of terrorist warfare in this wildly popular team-based game. Ally with teammates to complete strategic missions. Take out enemy sites. Rescue hostages. Your role affects your team's success. Your team's success affects your role.",
    "about_the_game": "Play the world's number 1 online action game. Engage in an incredibly realistic brand of terrorist warfare in this wildly popular team-based game. Ally with teammates to complete strategic missions. Take out enemy sites. Rescue hostages. Your role affects your team's success. Your team's success affects your role.",
    "short_description": "Play the world's number 1 online action game. Engage in an incredibly realistic brand of terrorist warfare in this wildly popular team-based game. Ally with

In [15]:
filtered_data = []

for app_id, details in data1.items():
    filtered_data.append({
        "name": details.get("name"),
        "steam_appid": details.get("steam_appid"),
        "is_free": details.get("is_free"),
        "platforms": details.get("platforms"),
        "metacritic": details.get("metacritic"),
        "recommendations": details.get("recommendations", {}).get("total"),
        "release_date": details.get("release_date", {}).get("date")
    })

# Convert to DataFrame for better readability and CSV export if needed
df = pd.DataFrame(filtered_data)

platforms_df = df['platforms'].apply(pd.Series)
df = pd.concat([df.drop(columns=['platforms']), platforms_df], axis=1)

# Handle cases where 'metacritic' is None by filling with empty dictionaries
metacritic_df = df['metacritic'].apply(lambda x: pd.Series(x) if isinstance(x, dict) else pd.Series({"score": None, "url": None}))
df = pd.concat([df.drop(columns=['metacritic']), metacritic_df], axis=1)

df=df.drop(columns=['url'])

df.head(10)

Unnamed: 0,name,steam_appid,is_free,recommendations,release_date,windows,mac,linux,score
0,Counter-Strike,10,False,153520.0,"Nov 1, 2000",True,True,True,88.0
1,Counter-Strike: Condition Zero,80,False,18927.0,"Mar 1, 2004",True,True,True,65.0
2,Crown Trick,1000010,False,4342.0,"Oct 16, 2020",True,False,False,83.0
3,"Cook, Serve, Delicious! 3?!",1000030,False,1616.0,"Oct 14, 2020",True,True,False,79.0
4,Zengeon,1000080,False,1187.0,"Jun 24, 2019",True,True,False,
5,Tower of Origin2-Worm's Nest,1000280,False,,"Sep 9, 2021",True,False,False,
6,人气动漫大乱斗,1000310,True,,"Dec 17, 2019",True,False,False,
7,Hellish Quart,1000360,False,6361.0,"Feb 16, 2021",True,False,False,
8,Rogue Reaper,1000380,True,,"Feb 1, 2019",True,False,False,
9,WRATH: Aeon of Ruin,1000410,False,1604.0,"Feb 27, 2024",True,False,True,71.0


In [16]:
filtered_data2 = []

for app_id, details in data2.items():
    filtered_data2.append({
        "name": details.get("name"),
        "steam_appid": details.get("steam_appid"),
        "is_free": details.get("is_free"),
        "platforms": details.get("platforms"),
        "metacritic": details.get("metacritic"),
        "recommendations": details.get("recommendations", {}).get("total"),
        "release_date": details.get("release_date", {}).get("date")
    })

# Convert to DataFrame for better readability and CSV export if needed
df2 = pd.DataFrame(filtered_data2)

platforms_df2 = df2['platforms'].apply(pd.Series)
df2 = pd.concat([df2.drop(columns=['platforms']), platforms_df2], axis=1)

# Handle cases where 'metacritic' is None by filling with empty dictionaries
metacritic_df2 = df2['metacritic'].apply(lambda x: pd.Series(x) if isinstance(x, dict) else pd.Series({"score": None, "url": None}))
df2 = pd.concat([df2.drop(columns=['metacritic']), metacritic_df], axis=1)

df2=df2.drop(columns=['url'])

df2.head(10)

Unnamed: 0,name,steam_appid,is_free,recommendations,release_date,windows,mac,linux,score
0,Civilization V - Civ and Scenario Pack: Polynesia,99610,False,,"Mar 3, 2011",True,True,True,88.0
1,Civilization V - Civ and Scenario Pack: Denmar...,99611,False,,"May 3, 2011",True,True,True,65.0
2,Civilization V - Civ and Scenario Pack: Korea,99612,False,205.0,"Aug 11, 2011",True,True,True,83.0
3,Civilization V - Scenario Pack: Wonders of the...,99614,False,,"Aug 11, 2011",True,True,True,79.0
4,Quest for the Golden Duck,996160,False,,"Jan 23, 2019",True,True,True,
5,Buttle Tank,996280,False,,"Dec 22, 2018",True,False,False,
6,Smartphone Tycoon,996380,False,1584.0,"Mar 1, 2019",True,False,False,
7,Spyro™ Reignited Trilogy,996580,False,11824.0,"Sep 3, 2019",True,False,False,
8,Moving Out,996770,False,1527.0,"Apr 28, 2020",True,False,False,
9,Banner of the Maid - Miss Elisa's Journal,996820,True,,"Aug 28, 2019",True,False,False,71.0


In [17]:
df_steam = pd.concat([df, df2], ignore_index=True)
df_steam.head()

Unnamed: 0,name,steam_appid,is_free,recommendations,release_date,windows,mac,linux,score
0,Counter-Strike,10,False,153520.0,"Nov 1, 2000",True,True,True,88.0
1,Counter-Strike: Condition Zero,80,False,18927.0,"Mar 1, 2004",True,True,True,65.0
2,Crown Trick,1000010,False,4342.0,"Oct 16, 2020",True,False,False,83.0
3,"Cook, Serve, Delicious! 3?!",1000030,False,1616.0,"Oct 14, 2020",True,True,False,79.0
4,Zengeon,1000080,False,1187.0,"Jun 24, 2019",True,True,False,


In [18]:
print(df.shape)
print(df2.shape)
print(df_steam.shape)

(4937, 9)
(21051, 9)
(25988, 9)


In [19]:
# Print the first key-value pair if data1 is a dictionary
if isinstance(data3, dict):
    first_key = list(data3.keys())[0]
    print(json.dumps({first_key: data3[first_key]}, indent=2))  # Pretty-print the first key-value pair
else:
    print("data3 is not a dictionary.")


{
  "10": {
    "appid": 10,
    "name": "Counter-Strike",
    "developer": "Valve",
    "publisher": "Valve",
    "score_rank": "",
    "positive": 235682,
    "negative": 6218,
    "userscore": 0,
    "owners": "10,000,000 .. 20,000,000",
    "average_forever": 8920,
    "average_2weeks": 7,
    "median_forever": 174,
    "median_2weeks": 10,
    "price": "999",
    "initialprice": "999",
    "discount": "0",
    "ccu": 14002,
    "languages": "English, French, German, Italian, Spanish - Spain, Simplified Chinese, Traditional Chinese, Korean",
    "genre": "Action",
    "tags": {
      "Action": 5480,
      "FPS": 4907,
      "Multiplayer": 3454,
      "Shooter": 3407,
      "Classic": 2830,
      "Team-Based": 1904,
      "First-Person": 1746,
      "Competitive": 1639,
      "Tactical": 1378,
      "1990's": 1234,
      "e-sports": 1221,
      "PvP": 914,
      "Old School": 812,
      "Military": 656,
      "Strategy": 629,
      "Survival": 314,
      "Score Attack": 296,
      "

In [20]:
data3 = {app_id: details for app_id, details in data3.items() if details is not None}

for app_id, details in data3.items():
    details.pop("score_rank", None)
    details.pop("userscore", None)

df3 = pd.DataFrame.from_dict(data3, orient='index')
 
languages_dummies = df3['languages'].str.get_dummies(sep=', ')

genres_dummies = df3['genre'].str.get_dummies(sep=', ').add_prefix('genre_')

tags_dummies = df3['tags'].apply(pd.Series).fillna(0).add_prefix('tag_') 

df3 = pd.concat([df3.drop(columns=['languages', 'genre', 'tags']), languages_dummies, genres_dummies, tags_dummies], axis=1)


df3.head()

Unnamed: 0,appid,name,developer,publisher,positive,negative,owners,average_forever,average_2weeks,median_forever,...,tag_Well-Written,tag_Werewolves,tag_Western,tag_Wholesome,tag_Word Game,tag_World War I,tag_World War II,tag_Wrestling,tag_Zombies,tag_e-sports
10,10,Counter-Strike,Valve,Valve,235682,6218,"10,000,000 .. 20,000,000",8920,7,174,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1221.0
100,100,Counter-Strike: Condition Zero,Valve,Valve,13442,1535,"10,000,000 .. 20,000,000",131,0,22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000010,1000010,Crown Trick,NEXT Studios,"Team17, NEXT Studios",4414,739,"500,000 .. 1,000,000",285,0,179,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000030,1000030,"Cook, Serve, Delicious! 3?!",Vertigo Gaming Inc.,Vertigo Gaming Inc.,1904,184,"100,000 .. 200,000",25,0,24,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000080,1000080,Zengeon,IndieLeague Studio,2P Games,1015,498,"100,000 .. 200,000",0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Print the first key-value pair if data1 is a dictionary
if isinstance(data4, dict):
    first_key = list(data4.keys())[0]
    print(json.dumps({first_key: data4[first_key]}, indent=2))  # Pretty-print the first key-value pair
else:
    print("data3 is not a dictionary.")

{
  "99610": {
    "appid": 99610,
    "name": "Civilization V - Civ and Scenario Pack: Polynesia",
    "developer": "Firaxis Games, Aspyr (Mac), Aspyr (Linux)",
    "publisher": "2K, Aspyr (Mac), Aspyr (Linux)",
    "score_rank": "",
    "positive": 70,
    "negative": 14,
    "userscore": 0,
    "owners": "0 .. 20,000",
    "average_forever": 0,
    "average_2weeks": 0,
    "median_forever": 0,
    "median_2weeks": 0,
    "price": "499",
    "initialprice": "499",
    "discount": "0",
    "ccu": 0,
    "languages": "English, French, German, Italian, Polish, Spanish - Spain, Korean",
    "genre": "Strategy",
    "tags": {
      "Strategy": 26
    }
  }
}


In [22]:
data4 = {app_id: details for app_id, details in data4.items() if details is not None}

for app_id, details in data4.items():
    details.pop("score_rank", None)
    details.pop("userscore", None)

df4 = pd.DataFrame.from_dict(data4, orient='index')
 
languages_dummies = df4['languages'].str.get_dummies(sep=', ')

genres_dummies = df4['genre'].str.get_dummies(sep=', ').add_prefix('genre_')

tags_dummies = df4['tags'].apply(pd.Series).fillna(0).add_prefix('tag_')

df4 = pd.concat([df4.drop(columns=['languages', 'genre', 'tags']), languages_dummies, genres_dummies, tags_dummies], axis=1)


df4.head()

Unnamed: 0,appid,name,developer,publisher,positive,negative,owners,average_forever,average_2weeks,median_forever,...,tag_Well-Written,tag_Werewolves,tag_Western,tag_Wholesome,tag_Word Game,tag_World War I,tag_World War II,tag_Wrestling,tag_Zombies,tag_e-sports
99610,99610,Civilization V - Civ and Scenario Pack: Polynesia,"Firaxis Games, Aspyr (Mac), Aspyr (Linux)","2K, Aspyr (Mac), Aspyr (Linux)",70,14,"0 .. 20,000",0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99611,99611,Civilization V - Civ and Scenario Pack: Denmar...,"Firaxis Games, Aspyr (Mac), Aspyr (Linux)","2K, Aspyr (Mac), Aspyr (Linux)",81,12,"0 .. 20,000",0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99612,99612,Civilization V - Civ and Scenario Pack: Korea,"Firaxis Games, Aspyr (Mac), Aspyr (Linux)","2K, Aspyr (Mac), Aspyr (Linux)",131,85,"0 .. 20,000",0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99614,99614,Civilization V - Scenario Pack: Wonders of the...,"Firaxis Games, Aspyr (Mac), Aspyr (Linux)","2K, Aspyr (Mac), Aspyr (Linux)",54,14,"0 .. 20,000",0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
996160,996160,Quest for the Golden Duck,Bigosaur,Bigosaur,14,4,"0 .. 20,000",0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Reset the index to ensure there are no index conflicts and use ignore_index in concatenation
df3 = df3.reset_index(drop=True)
df4 = df4.reset_index(drop=True)

# Concatenate with ignore_index=True to avoid index conflicts
df_steamSPY = pd.concat([df3, df4], axis=0, sort=False, ignore_index=True).fillna(0)

# Optionally, convert NaNs to integers where appropriate
df_steamSPY = df_steamSPY.astype(int, errors='ignore')

df_steamSPY.head()

Unnamed: 0,appid,name,developer,publisher,positive,negative,owners,average_forever,average_2weeks,median_forever,...,tag_Batman,tag_Birds,tag_Bowling,tag_Extraction Shooter,tag_Jump Scare,tag_Lemmings,tag_Philisophical,tag_Snooker,tag_Star Wars,tag_Steam Machine
0,10,Counter-Strike,Valve,Valve,235682,6218,"10,000,000 .. 20,000,000",8920,7,174,...,0,0,0,0,0,0,0,0,0,0
1,100,Counter-Strike: Condition Zero,Valve,Valve,13442,1535,"10,000,000 .. 20,000,000",131,0,22,...,0,0,0,0,0,0,0,0,0,0
2,1000010,Crown Trick,NEXT Studios,"Team17, NEXT Studios",4414,739,"500,000 .. 1,000,000",285,0,179,...,0,0,0,0,0,0,0,0,0,0
3,1000030,"Cook, Serve, Delicious! 3?!",Vertigo Gaming Inc.,Vertigo Gaming Inc.,1904,184,"100,000 .. 200,000",25,0,24,...,0,0,0,0,0,0,0,0,0,0
4,1000080,Zengeon,IndieLeague Studio,2P Games,1015,498,"100,000 .. 200,000",0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
print(df3.shape)
print(df4.shape)
print(df_steamSPY.shape)


(5000, 540)
(21714, 581)
(26714, 584)


In [26]:
df_steam=df_steam.drop(columns=['name'])
df_steam

Unnamed: 0,steam_appid,is_free,recommendations,release_date,windows,mac,linux,score
0,10,False,153520.0,"Nov 1, 2000",True,True,True,88.0
1,80,False,18927.0,"Mar 1, 2004",True,True,True,65.0
2,1000010,False,4342.0,"Oct 16, 2020",True,False,False,83.0
3,1000030,False,1616.0,"Oct 14, 2020",True,True,False,79.0
4,1000080,False,1187.0,"Jun 24, 2019",True,True,False,
...,...,...,...,...,...,...,...,...
25983,506840,False,,"Aug 7, 2016",True,False,False,
25984,506870,False,547.0,"Feb 22, 2017",True,True,True,
25985,506900,False,478.0,"Jul 13, 2017",True,False,False,
25986,506920,True,,"Aug 2, 2016",True,True,True,


In [30]:
df_all=pd.merge(df_steamSPY,df_steam,left_on='appid',right_on='steam_appid',how='inner')
df_all=df_all.drop(columns=['steam_appid'])
df_all

Unnamed: 0,appid,name,developer,publisher,positive,negative,owners,average_forever,average_2weeks,median_forever,...,tag_Snooker,tag_Star Wars,tag_Steam Machine,is_free,recommendations,release_date,windows,mac,linux,score
0,10,Counter-Strike,Valve,Valve,235682,6218,"10,000,000 .. 20,000,000",8920,7,174,...,0,0,0,False,153520.0,"Nov 1, 2000",True,True,True,88.0
1,1000010,Crown Trick,NEXT Studios,"Team17, NEXT Studios",4414,739,"500,000 .. 1,000,000",285,0,179,...,0,0,0,False,4342.0,"Oct 16, 2020",True,False,False,83.0
2,1000030,"Cook, Serve, Delicious! 3?!",Vertigo Gaming Inc.,Vertigo Gaming Inc.,1904,184,"100,000 .. 200,000",25,0,24,...,0,0,0,False,1616.0,"Oct 14, 2020",True,True,False,79.0
3,1000080,Zengeon,IndieLeague Studio,2P Games,1015,498,"100,000 .. 200,000",0,0,0,...,0,0,0,False,1187.0,"Jun 24, 2019",True,True,False,
4,1000280,Tower of Origin2-Worm's Nest,Villain Role,Villain Role,35,16,"0 .. 20,000",0,0,0,...,0,0,0,False,,"Sep 9, 2021",True,False,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25982,506840,The Dungeons of Castle Madness,Third Impression,Third Impression,14,11,"20,000 .. 50,000",0,0,0,...,0,0,0,False,,"Aug 7, 2016",True,False,False,
25983,506870,Unexplored,Ludomotion,Ludomotion,598,103,"100,000 .. 200,000",0,0,0,...,0,0,0,False,547.0,"Feb 22, 2017",True,True,True,
25984,506900,Downward: Enhanced Edition,Caracal Games,Plug In Digital,379,151,"50,000 .. 100,000",0,0,0,...,0,0,0,False,478.0,"Jul 13, 2017",True,False,False,
25985,506920,Tricky Towers - Original Soundtrack,WeirdBeard,,10,0,"0 .. 20,000",0,0,0,...,0,0,0,True,,"Aug 2, 2016",True,True,True,


In [31]:
df_all.shape

(25987, 591)

In [32]:
from google.cloud import storage

# Save the DataFrame to a CSV file locally
output_file = "steam_data_all.csv"
df_all.to_csv(output_file, index=False)

# Initialize Google Cloud Storage client
client = storage.Client()

# Define your bucket name and destination file path in the bucket
bucket_name = 'msca-bdp-student-gcs'
destination_blob_name = "Group5/Steam_data/steam_data_all.csv" 

# Upload the file to the bucket
bucket = client.get_bucket(bucket_name)
blob = bucket.blob(destination_blob_name)
blob.upload_from_filename(output_file)

print(f"File successfully uploaded to gs://{bucket_name}/{destination_blob_name}")


File successfully uploaded to gs://msca-bdp-student-gcs/Group5/Steam_data/steam_data_all.csv


In [2]:
import pandas as pd

# Path to the CSV file in GCS
csv_path = "gs://msca-bdp-student-gcs/Group5/Steam_data/steam_data_all.csv"

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(csv_path)

# Perform primary data investigation
# Display basic information about the dataset
print("Basic Information:")
print(df.info())

# Display the first few rows of the dataset
print("\nFirst Few Rows:")
print(df.head())

# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Display basic statistics for numerical columns
print("\nSummary Statistics for Numerical Columns:")
print(df.describe())

# Display basic statistics for categorical columns
print("\nSummary Statistics for Categorical Columns:")
print(df.describe(include=['object']))

# Display column names
print("\nColumn Names:")
print(df.columns.tolist())


Basic Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25987 entries, 0 to 25986
Columns: 591 entries, appid to score
dtypes: bool(4), float64(2), int64(580), object(5)
memory usage: 116.5+ MB
None

First Few Rows:
     appid                          name            developer  \
0       10                Counter-Strike                Valve   
1  1000010                   Crown Trick         NEXT Studios   
2  1000030   Cook, Serve, Delicious! 3?!  Vertigo Gaming Inc.   
3  1000080                       Zengeon   IndieLeague Studio   
4  1000280  Tower of Origin2-Worm's Nest         Villain Role   

              publisher  positive  negative                    owners  \
0                 Valve    235682      6218  10,000,000 .. 20,000,000   
1  Team17, NEXT Studios      4414       739      500,000 .. 1,000,000   
2   Vertigo Gaming Inc.      1904       184        100,000 .. 200,000   
3              2P Games      1015       498        100,000 .. 200,000   
4          Vill

In [5]:
print("First Row (Structured View):")
print(df.iloc[0].to_string())  # Convert the first row to a string for better formatting


First Row (Structured View):
appid                                                                                 10
name                                                                      Counter-Strike
developer                                                                          Valve
publisher                                                                          Valve
positive                                                                          235682
negative                                                                            6218
owners                                                          10,000,000 .. 20,000,000
average_forever                                                                     8920
average_2weeks                                                                         7
median_forever                                                                       174
median_2weeks                                                                    