 
  <h1><center><font size=10>Google Query</center></font></h1>
  <h1><center>Exploratory Data Analysis - Google Query Case Study</center></h1>

<p>This data analysis delves into Google search data by DMA. By examining this data, we aim to uncover insights into:
</p>
<ol>
    <li>
        <p>Searches for NBA team-related terms.</p>
    </li>
    <li>
        <p>The relationship between NBA teams' records (winning percentage) and searches for the teams.</p>
    </li>
    
</ol>

In [1]:
# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd
import requests
from pathlib import Path

# Libraries to help with data visualization
import matplotlib.pyplot as plt
from scipy.stats import linregress

In [31]:
# Study data files
google_query_path = "data/Google_Bigquery_Top_Search_Week_Ending_2024-06-02.csv"
google_query_top_search_path = "data/Google_Bigquery_Top_Search_Jan_Jun_2024.csv"

google_query_data = pd.read_csv(google_query_path)
google_query_top_search_data = pd.read_csv(google_query_top_search_path)

# Read the CSV into a Pandas DataFrame
google_queryDF = pd.merge(google_query_data, google_query_top_search_data, how='outer', on='dma_id')

# Display the data table for preview
google_queryDF.head()

Unnamed: 0,week_x,score_x,rank_x,refresh_date_x,dma_name_x,dma_id,term_x,week_y,score_y,rank_y,refresh_date_y,dma_name_y,term_y
0,2024-05-26,100.0,13,2024-05-30,Charlotte NC,517,All Eyes on Rafah meaning,2024-01-07,,16,2024-05-15,Charlotte NC,Alice Munro
1,2024-05-26,100.0,13,2024-05-30,Charlotte NC,517,All Eyes on Rafah meaning,2024-02-25,,16,2024-05-15,Charlotte NC,Alice Munro
2,2024-05-26,100.0,13,2024-05-30,Charlotte NC,517,All Eyes on Rafah meaning,2024-03-17,,16,2024-05-15,Charlotte NC,Alice Munro
3,2024-05-26,100.0,13,2024-05-30,Charlotte NC,517,All Eyes on Rafah meaning,2024-03-24,,16,2024-05-15,Charlotte NC,Alice Munro
4,2024-05-26,100.0,13,2024-05-30,Charlotte NC,517,All Eyes on Rafah meaning,2024-04-21,,16,2024-05-15,Charlotte NC,Alice Munro


In [32]:
# Print the last five rows of data to the screen
google_queryDF.tail()

Unnamed: 0,week_x,score_x,rank_x,refresh_date_x,dma_name_x,dma_id,term_x,week_y,score_y,rank_y,refresh_date_y,dma_name_y,term_y
54392959,2024-05-26,2.0,1,2024-06-03,Gainesville FL,592,GME stock,2024-03-17,10.0,24,2024-05-13,Gainesville FL,Celtics
54392960,2024-05-26,2.0,1,2024-06-03,Gainesville FL,592,GME stock,2024-03-24,10.0,24,2024-05-13,Gainesville FL,Celtics
54392961,2024-05-26,2.0,1,2024-06-03,Gainesville FL,592,GME stock,2024-04-07,,24,2024-05-13,Gainesville FL,Celtics
54392962,2024-05-26,2.0,1,2024-06-03,Gainesville FL,592,GME stock,2024-04-28,20.0,24,2024-05-13,Gainesville FL,Celtics
54392963,2024-05-26,2.0,1,2024-06-03,Gainesville FL,592,GME stock,2024-05-05,21.0,24,2024-05-13,Gainesville FL,Celtics


In [37]:
# Convert 'week' column to datetime format
google_queryDF['week_x'] = pd.to_datetime(google_queryDF['week_x'])
google_queryDF['week_y'] = pd.to_datetime(google_queryDF['week_y'])

google_queryDF

# Group by month, term. and DMA and aggregate the data
monthly_data = google_queryDF.groupby(['term_x', 'dma_name_x', google_queryDF['week_x'].dt.to_period('M')]).agg({
    'score_x': 'mean',
    'rank_x': 'mean',
    'refresh_date_x': 'max',  # Get the latest refresh_date in the month
    'dma_id': 'first',       # Get the first dma_id in the month
  }).reset_index()

monthly_data.rename(columns={'week_x': 'month_x'}, inplace=True)
monthly_data.rename(columns={'week_y': 'month_y'}, inplace=True)

# Display the monthly data
print(monthly_data)
monthly_data.tail()


         term_x                          dma_name_x  month_x  score_x  rank_x  \
0      ABC News               Abilene-Sweetwater TX  2024-05      8.0    16.0   
1      ABC News                           Albany GA  2024-05      NaN    16.0   
2      ABC News          Albany-Schenectady-Troy NY  2024-05     13.0    16.0   
3      ABC News             Albuquerque-Santa Fe NM  2024-05     10.0    16.0   
4      ABC News                       Alexandria LA  2024-05    100.0    16.0   
...         ...                                 ...      ...      ...     ...   
65515    Wolves                       Wilmington NC  2024-05     80.0    10.0   
65516    Wolves  Yakima-Pasco-Richland-Kennewick WA  2024-05     75.0    10.0   
65517    Wolves                       Youngstown OH  2024-05     82.0    10.0   
65518    Wolves                Yuma AZ-El Centro CA  2024-05     37.0    10.0   
65519    Wolves                       Zanesville OH  2024-05      9.0    10.0   

      refresh_date_x  dma_i

Unnamed: 0,term_x,dma_name_x,month_x,score_x,rank_x,refresh_date_x,dma_id
65515,Wolves,Wilmington NC,2024-05,80.0,10.0,2024-05-30,550
65516,Wolves,Yakima-Pasco-Richland-Kennewick WA,2024-05,75.0,10.0,2024-05-30,810
65517,Wolves,Youngstown OH,2024-05,82.0,10.0,2024-05-30,536
65518,Wolves,Yuma AZ-El Centro CA,2024-05,37.0,10.0,2024-05-30,771
65519,Wolves,Zanesville OH,2024-05,9.0,10.0,2024-05-30,596


In [39]:
# Import the data file containing NBA teams' winning percentages, by month
NBA_team_path = "data/NBA 23-24 Records by Team, by Month - Merge Candidate.csv"

NBA_team_data = pd.read_csv(NBA_team_path)

# Read the CSV into a Pandas DataFrame
NBA_recordsDF = pd.DataFrame(NBA_team_data)

# Display the data table for preview
NBA_recordsDF.head(30)

# Unpivot the data to create row for each month for each team, 210 (= 30 teams x 7 months) rows total
unpivoted_data = NBA_recordsDF.melt(id_vars='Team', var_name='month', value_name='value')
pivoted_data=unpivoted_data.sort_values(by=["Team", "month"],ascending=True)

# Display the unpivoted data
print(pivoted_data)

                   Team    month  value
20        Atlanta Hawks  2023-11  0.500
50        Atlanta Hawks  2023-12  0.500
80        Atlanta Hawks  2024-01  0.406
110       Atlanta Hawks  2024-02  0.426
140       Atlanta Hawks  2024-03  0.441
..                  ...      ...    ...
88   Washington Wizards  2024-01  0.188
118  Washington Wizards  2024-02  0.191
148  Washington Wizards  2024-03  0.153
178  Washington Wizards  2024-04  0.187
208  Washington Wizards  2024-05  0.183

[210 rows x 3 columns]


In [42]:
# Calculate the total count per DMA
dma_counts_df = google_queryDF.groupby('dma_name_x')['dma_name_x'].count()

# Display results
dma_counts_df

# Calculate the total count per weej
week_counts_df_x = google_queryDF.groupby('week_x')['week_x'].count()
week_counts_df_y = google_queryDF.groupby('week_y')['week_y'].count()

# Display results
week_counts_df_x
week_counts_df_y

# Calculate the total count per rank
rank_counts_df_x = google_queryDF.groupby('rank_x')['rank_x'].count()
rank_counts_df_y = google_queryDF.groupby('rank_y')['rank_y'].count()

# Display results
rank_counts_df_x
rank_counts_df_y

# Calculate the total count per rank
filtered_df = google_queryDF[google_queryDF['rank_x'].isin([1,2,3,4,5])]
term_counts = filtered_df['term_x'].value_counts()

#value_counts().reset_index().rename(columns={'index': 'term'}) 
top_ranks_df = term_counts[term_counts == term_counts.max()]
#value_counts().reset_index().rename(columns={'index': 'term'}) 

# Display results
top_5s_df=filtered_df[filtered_df['term_x'].isin(top_ranks_df.index)]

top_5s_df.head(20)


Unnamed: 0,week_x,score_x,rank_x,refresh_date_x,dma_name_x,dma_id,term_x,week_y,score_y,rank_y,refresh_date_y,dma_name_y,term_y
13075,2024-05-26,80.0,2,2024-06-01,Charlotte NC,517,Real Madrid,2024-01-07,,16,2024-05-15,Charlotte NC,Alice Munro
13076,2024-05-26,80.0,2,2024-06-01,Charlotte NC,517,Real Madrid,2024-02-25,,16,2024-05-15,Charlotte NC,Alice Munro
13077,2024-05-26,80.0,2,2024-06-01,Charlotte NC,517,Real Madrid,2024-03-17,,16,2024-05-15,Charlotte NC,Alice Munro
13078,2024-05-26,80.0,2,2024-06-01,Charlotte NC,517,Real Madrid,2024-03-24,,16,2024-05-15,Charlotte NC,Alice Munro
13079,2024-05-26,80.0,2,2024-06-01,Charlotte NC,517,Real Madrid,2024-04-21,,16,2024-05-15,Charlotte NC,Alice Munro
13080,2024-05-26,80.0,2,2024-06-01,Charlotte NC,517,Real Madrid,2024-04-28,,16,2024-05-15,Charlotte NC,Alice Munro
13081,2024-05-26,80.0,2,2024-06-01,Charlotte NC,517,Real Madrid,2024-05-05,,16,2024-05-15,Charlotte NC,Alice Munro
13082,2024-05-26,80.0,2,2024-06-01,Charlotte NC,517,Real Madrid,2024-01-14,,23,2024-05-15,Charlotte NC,Angela Alsobrooks
13083,2024-05-26,80.0,2,2024-06-01,Charlotte NC,517,Real Madrid,2024-01-28,,23,2024-05-15,Charlotte NC,Angela Alsobrooks
13084,2024-05-26,80.0,2,2024-06-01,Charlotte NC,517,Real Madrid,2024-02-11,,23,2024-05-15,Charlotte NC,Angela Alsobrooks
