# Data Prep for Neo4j Import

## 1. Load Data, Clean NaN Values, and Remove Unnecessary Columns

### a. Scrap Position Group Keys from 247 Sports Website into dataframe

In [1]:
import neo4j
import pandas as pd
import numpy as np

from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests

In [2]:
from bs4 import BeautifulSoup

import requests

header = {'User-agent' : 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5'}

url = requests.get("https://247sports.com/Position/?year=2010&sport=Football", headers=header).text

soup = BeautifulSoup(url, 'html.parser')
table = soup.find('table', {'class': 'position_tbl'})

df_pos = pd.read_html(str(table))[0]

In [3]:
df_pos

Unnamed: 0,Sport,Platoon,Position Group,Name,Abbreviation,Start Year,End Year,Default Name,Unnamed: 8
0,Football,Football (Offense),Quarterback,Pro-Style Quarterback,PRO,0,2020.0,Pro-Style Quarterback,EditDelete More ▾ Player PositionsPlayer Sport...
1,Football,Football (Offense),Quarterback,Dual-Threat Quarterback,DUAL,0,2020.0,Dual-Threat Quarterback,EditDelete More ▾ Player PositionsPlayer Sport...
2,Football,Football (Offense),Quarterback,Quarterback,QB,2021,,Quarterback,EditDelete More ▾ Player PositionsPlayer Sport...
3,Football,Football (Offense),Running Back,All Purpose Back,APB,0,2020.0,All Purpose Back,EditDelete More ▾ Player PositionsPlayer Sport...
4,Football,Football (Offense),Running Back,Running Back,RB,0,,Running Back,EditDelete More ▾ Player PositionsPlayer Sport...
5,Football,Football (Offense),Running Back,Fullback,FB,0,2020.0,Fullback,EditDelete More ▾ Player PositionsPlayer Sport...
6,Football,Football (Offense),Receiver,Wide Receiver,WR,0,,Wide Receiver,EditDelete More ▾ Player PositionsPlayer Sport...
7,Football,Football (Offense),Receiver,Tight End,TE,0,,Tight End,EditDelete More ▾ Player PositionsPlayer Sport...
8,Football,Football (Offense),Offensive Line,Offensive Tackle,OT,0,,Offensive Tackle,EditDelete More ▾ Player PositionsPlayer Sport...
9,Football,Football (Offense),Offensive Line,Offensive Guard,OG,0,2020.0,Offensive Guard,EditDelete More ▾ Player PositionsPlayer Sport...


In [4]:
df_pos = df_pos.drop(['Sport', 'Start Year','End Year','Default Name','Unnamed: 8'], axis=1)
df_pos.head()

Unnamed: 0,Platoon,Position Group,Name,Abbreviation
0,Football (Offense),Quarterback,Pro-Style Quarterback,PRO
1,Football (Offense),Quarterback,Dual-Threat Quarterback,DUAL
2,Football (Offense),Quarterback,Quarterback,QB
3,Football (Offense),Running Back,All Purpose Back,APB
4,Football (Offense),Running Back,Running Back,RB


In [5]:
df_pos.rename({'Platoon': 'type', 'Position Group': 'position_group', 'Name':'position_name',
           'Abbreviation': 'position'}, axis=1, inplace=True)

print(df_pos.shape)
df_pos.head()

(27, 4)


Unnamed: 0,type,position_group,position_name,position
0,Football (Offense),Quarterback,Pro-Style Quarterback,PRO
1,Football (Offense),Quarterback,Dual-Threat Quarterback,DUAL
2,Football (Offense),Quarterback,Quarterback,QB
3,Football (Offense),Running Back,All Purpose Back,APB
4,Football (Offense),Running Back,Running Back,RB


In [6]:
df_pos.loc[df_pos.type.str.contains('Offense'), 'type'] = 'Offense'
df_pos.loc[df_pos.type.str.contains('Defense'), 'type'] = 'Defense'
df_pos.loc[df_pos.type.str.contains('Special'), 'type'] = 'Special Teams'

df_pos.head()

Unnamed: 0,type,position_group,position_name,position
0,Offense,Quarterback,Pro-Style Quarterback,PRO
1,Offense,Quarterback,Dual-Threat Quarterback,DUAL
2,Offense,Quarterback,Quarterback,QB
3,Offense,Running Back,All Purpose Back,APB
4,Offense,Running Back,Running Back,RB


In [7]:
df_pos = df_pos[['position', 'position_name', 'position_group', 'type']]
df_pos.head()

Unnamed: 0,position,position_name,position_group,type
0,PRO,Pro-Style Quarterback,Quarterback,Offense
1,DUAL,Dual-Threat Quarterback,Quarterback,Offense
2,QB,Quarterback,Quarterback,Offense
3,APB,All Purpose Back,Running Back,Offense
4,RB,Running Back,Running Back,Offense


### b.) Load CFB Team Info

In [8]:
df_team = pd.read_csv("cfb_team_info_all.csv", engine='python')
print(df_team.shape)
df_team.head()

(1693, 27)


Unnamed: 0,ï»¿id,school,mascot,abbreviation,alt_name1,alt_name2,alt_name3,conference,division,color,...,location.zip,location.country_code,location.timezone,location.latitude,location.longitude,location.elevation,location.capacity,location.year_constructed,location.grass,location.dome
0,2000,Abilene Christian,Wildcats,ACU,,ACU,Abil Christian,,,#4e2683,...,,,,,,,,,,
1,2001,Adams State,Grizzlies,ADST,,ADST,Adams St,,,#000000,...,,,,,,,,,,
2,2003,Adrian,Bulldogs,ADR,,ADR,Adrian,,,#000000,...,,,,,,,,,,
3,2005,Air Force,Falcons,AFA,,AFA,Air Force,Mountain West,Mountain,#004a7b,...,80840.0,US,America/Denver,38.99697,-104.843617,2024.875732,46692.0,1962.0,False,False
4,2006,Akron,Zips,AKR,,AKR,Akron,Mid-American,East,#00285e,...,44399.0,US,America/New_York,41.072553,-81.508341,321.287506,30000.0,2009.0,False,False


In [9]:
df_team.dtypes

ï»¿id                          int64
school                        object
mascot                        object
abbreviation                  object
alt_name1                     object
alt_name2                     object
alt_name3                     object
conference                    object
division                      object
color                         object
alt_color                     object
logos[0]                      object
logos[1]                      object
location.venue_id            float64
location.name                 object
location.city                 object
location.state                object
location.zip                 float64
location.country_code         object
location.timezone             object
location.latitude            float64
location.longitude           float64
location.elevation           float64
location.capacity            float64
location.year_constructed    float64
location.grass                object
location.dome                 object
d

In [10]:
df_team2 = df_team[['school','conference','division','location.city','location.state','location.latitude','location.longitude']]
df_team2.head()

Unnamed: 0,school,conference,division,location.city,location.state,location.latitude,location.longitude
0,Abilene Christian,,,,,,
1,Adams State,,,,,,
2,Adrian,,,,,,
3,Air Force,Mountain West,Mountain,Colorado Springs,CO,38.99697,-104.843617
4,Akron,Mid-American,East,Akron,OH,41.072553,-81.508341


In [11]:
# rename columns for more accurate/clean headers

df_team2.rename({'school': 'college_commit', 'location.city': 'college_city', 'location.state':'college_state',
           'location.latitude': 'college_lat', 'location.longitude': 'college_long'}, axis=1, inplace=True)

print(df_team2.shape)
df_team2.head()

(1693, 7)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,college_commit,conference,division,college_city,college_state,college_lat,college_long
0,Abilene Christian,,,,,,
1,Adams State,,,,,,
2,Adrian,,,,,,
3,Air Force,Mountain West,Mountain,Colorado Springs,CO,38.99697,-104.843617
4,Akron,Mid-American,East,Akron,OH,41.072553,-81.508341


In [12]:
df_sjs = df_team2[df_team2['college_commit'].str.contains("San J")]
df_sjs

Unnamed: 0,college_commit,conference,division,college_city,college_state,college_lat,college_long
527,San JosÃ© State,Mountain West,West,San Jose,CA,37.319668,-121.868296


In [13]:
df_team2.loc[df_team2.college_commit.str.contains('San J', na=False), 'college_commit'] = 'San José State'

df_team2.loc[df_team2.college_commit.str.contains('San J', na=False)].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Unnamed: 0,college_commit,conference,division,college_city,college_state,college_lat,college_long
527,San José State,Mountain West,West,San Jose,CA,37.319668,-121.868296


In [14]:
# ensure all rows have values

df_team2.isnull().sum(axis = 0)

college_commit       0
conference        1563
division          1589
college_city      1481
college_state     1481
college_lat       1481
college_long      1481
dtype: int64

In [15]:
# drop rows containing NaN values for selected columns

df_team2.dropna(subset=['college_city','college_state', 'college_lat','college_long'], inplace=True)

print(df_team2.shape)
df_team2.head()

(212, 7)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_team2.dropna(subset=['college_city','college_state', 'college_lat','college_long'], inplace=True)


Unnamed: 0,college_commit,conference,division,college_city,college_state,college_lat,college_long
3,Air Force,Mountain West,Mountain,Colorado Springs,CO,38.99697,-104.843617
4,Akron,Mid-American,East,Akron,OH,41.072553,-81.508341
5,Alabama,SEC,West,Tuscaloosa,AL,33.208275,-87.550384
7,Alabama State,,,Montgomery,AL,32.37949,-86.293002
8,Albany,,,Albany,NY,42.680981,-73.827276


In [16]:
df_team2[['conference']] = df_team2[['conference']].fillna("FCS")
df_team2[['division']] = df_team2[['division']].fillna("fcs")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [17]:
# ensure all rows have values

df_team2.isnull().sum(axis = 0)

college_commit    0
conference        0
division          0
college_city      0
college_state     0
college_lat       0
college_long      0
dtype: int64

### Add in extra university location data for missing colleges

In [18]:
df_team_ex = pd.read_csv("cfb_team_info_extra.csv", engine='python')
print(df_team_ex.shape)
df_team_ex.head()

(60, 7)


Unnamed: 0,college_commit,conference,division,city,state,lat,long
0,Abilene Christian,FCS,FCS,Abilene,TX,-99.709797,32.468943
1,Alabama A&M,FCS,FCS,Normal,AL,-86.568502,34.783368
2,Brown,FCS,FCS,Providence,RI,-71.40385,41.82617
3,Butler,FCS,FCS,Indianapolis,IN,-86.172167,39.8374
4,Campbell,FCS,FCS,Buies Creek,NC,-78.738238,35.409149


In [19]:
# rename columns for more accurate/clean headers

df_team_ex.rename({'school': 'college_commit', 'city': 'college_city', 'state':'college_state',
           'lat': 'college_lat', 'long': 'college_long'}, axis=1, inplace=True)

print(df_team_ex.shape)
df_team_ex.head()

(60, 7)


Unnamed: 0,college_commit,conference,division,college_city,college_state,college_lat,college_long
0,Abilene Christian,FCS,FCS,Abilene,TX,-99.709797,32.468943
1,Alabama A&M,FCS,FCS,Normal,AL,-86.568502,34.783368
2,Brown,FCS,FCS,Providence,RI,-71.40385,41.82617
3,Butler,FCS,FCS,Indianapolis,IN,-86.172167,39.8374
4,Campbell,FCS,FCS,Buies Creek,NC,-78.738238,35.409149


In [20]:
# ensure all rows have values

df_team_ex.isnull().sum(axis = 0)

college_commit    0
conference        0
division          0
college_city      0
college_state     0
college_lat       0
college_long      0
dtype: int64

In [21]:
# concat extra college data onto existing dataframe

df_team1 = pd.concat([df_team2, df_team_ex], ignore_index=True)
df_team1.head(10)

Unnamed: 0,college_commit,conference,division,college_city,college_state,college_lat,college_long
0,Air Force,Mountain West,Mountain,Colorado Springs,CO,38.99697,-104.843617
1,Akron,Mid-American,East,Akron,OH,41.072553,-81.508341
2,Alabama,SEC,West,Tuscaloosa,AL,33.208275,-87.550384
3,Alabama State,FCS,fcs,Montgomery,AL,32.37949,-86.293002
4,Albany,FCS,fcs,Albany,NY,42.680981,-73.827276
5,Alcorn State,FCS,fcs,Lorman,MS,31.873621,-91.13488
6,Appalachian State,Sun Belt,fcs,Boone,NC,36.211427,-81.685428
7,Arizona,Pac-12,South,Tucson,AZ,32.228805,-110.948868
8,Arizona State,Pac-12,South,Tempe,AZ,33.426447,-111.9325
9,Arkansas,SEC,West,Fayetteville,AR,36.068066,-94.178953


In [22]:
# ensure all rows have values

df_team_ex.isnull().sum(axis = 0)

college_commit    0
conference        0
division          0
college_city      0
college_state     0
college_lat       0
college_long      0
dtype: int64

In [23]:
print(df_team1.shape)
df_team1.head()

(272, 7)


Unnamed: 0,college_commit,conference,division,college_city,college_state,college_lat,college_long
0,Air Force,Mountain West,Mountain,Colorado Springs,CO,38.99697,-104.843617
1,Akron,Mid-American,East,Akron,OH,41.072553,-81.508341
2,Alabama,SEC,West,Tuscaloosa,AL,33.208275,-87.550384
3,Alabama State,FCS,fcs,Montgomery,AL,32.37949,-86.293002
4,Albany,FCS,fcs,Albany,NY,42.680981,-73.827276


In [24]:
# check csv file for errors

df_team1.to_csv('cfb_team_check_V3.csv',encoding='utf-8-sig')

### c. Load CFB Player Recruiting Data

In [25]:
df = pd.read_csv("2015-2021 CFB Recruiting Players data.csv", engine='python')
print(df.shape)
df.head()

(27179, 19)


Unnamed: 0,id,athleteId,recruitType,year,ranking,name,school,committedTo,position,height,weight,stars,rating,city,stateProvince,country,hometownInfo.latitude,hometownInfo.longitude,hometownInfo.fipsCode
0,66928,3915192.0,HighSchool,2015,1.0,Trenton Thompson,Westover,Georgia,DT,74.0,313.0,5,0.9992,Albany,GA,USA,31.578206,-84.155681,13095.0
1,31860,,HighSchool,2015,1.0,Trent Thompson,Westover,Georgia,DT,74.5,313.0,5,0.9991,Albany,GA,USA,31.578206,-84.155681,13095.0
2,31861,-1009710.0,HighSchool,2015,2.0,Martez Ivey,Apopka,Florida,OT,77.5,275.0,5,0.999,Apopka,FL,USA,28.677968,-81.511521,12095.0
3,31862,3916922.0,HighSchool,2015,3.0,Byron Cowart,Armwood,Auburn,SDE,76.0,250.0,5,0.9987,Seffner,FL,USA,27.998541,-82.274884,12057.0
4,31863,3912545.0,HighSchool,2015,4.0,Iman Marshall,Long Beach Poly,USC,CB,73.0,190.0,5,0.9985,Long Beach,CA,USA,33.769016,-118.191605,6037.0


In [26]:
df.dtypes

id                          int64
athleteId                 float64
recruitType                object
year                        int64
ranking                   float64
name                       object
school                     object
committedTo                object
position                   object
height                    float64
weight                    float64
stars                       int64
rating                    float64
city                       object
stateProvince              object
country                    object
hometownInfo.latitude     float64
hometownInfo.longitude    float64
hometownInfo.fipsCode     float64
dtype: object

In [27]:
df.isnull().sum(axis = 0)

id                            0
athleteId                 11704
recruitType                   0
year                          0
ranking                     360
name                          0
school                      275
committedTo                5524
position                     18
height                       51
weight                       55
stars                         0
rating                        0
city                        298
stateProvince               300
country                     196
hometownInfo.latitude       380
hometownInfo.longitude      380
hometownInfo.fipsCode       391
dtype: int64

In [28]:
df.nunique(axis=0)

id                        27179
athleteId                 15182
recruitType                   1
year                          7
ranking                    4257
name                      25912
school                     5291
committedTo                 271
position                     22
height                       57
weight                      244
stars                         5
rating                     2186
city                       3034
stateProvince                60
country                      17
hometownInfo.latitude      3611
hometownInfo.longitude     3611
hometownInfo.fipsCode      1571
dtype: int64

In [29]:
df = df.drop(['athleteId', 'recruitType','country'], axis=1)

In [30]:
df['year'].value_counts()

2017    4359
2020    4307
2019    4165
2016    4053
2018    3950
2015    3608
2021    2737
Name: year, dtype: int64

In [31]:
df1 = df.drop_duplicates(
    subset=['year','ranking'], 
    keep="first").reset_index(drop=True)
print(df1.shape)
df1.head()

(26403, 16)


Unnamed: 0,id,year,ranking,name,school,committedTo,position,height,weight,stars,rating,city,stateProvince,hometownInfo.latitude,hometownInfo.longitude,hometownInfo.fipsCode
0,66928,2015,1.0,Trenton Thompson,Westover,Georgia,DT,74.0,313.0,5,0.9992,Albany,GA,31.578206,-84.155681,13095.0
1,31861,2015,2.0,Martez Ivey,Apopka,Florida,OT,77.5,275.0,5,0.999,Apopka,FL,28.677968,-81.511521,12095.0
2,31862,2015,3.0,Byron Cowart,Armwood,Auburn,SDE,76.0,250.0,5,0.9987,Seffner,FL,27.998541,-82.274884,12057.0
3,31863,2015,4.0,Iman Marshall,Long Beach Poly,USC,CB,73.0,190.0,5,0.9985,Long Beach,CA,33.769016,-118.191605,6037.0
4,31864,2015,5.0,Derwin James,Haines City Senior,Florida State,S,74.0,201.0,5,0.9981,Auburndale,FL,28.107088,-81.80358,12105.0


In [32]:
df1.isnull().sum(axis = 0)

id                           0
year                         0
ranking                      3
name                         0
school                      30
committedTo               5361
position                    10
height                      46
weight                      50
stars                        0
rating                       0
city                       125
stateProvince              127
hometownInfo.latitude      204
hometownInfo.longitude     204
hometownInfo.fipsCode      213
dtype: int64

In [33]:
df1['year'].value_counts()

2017    4252
2019    4090
2016    3978
2020    3934
2018    3887
2015    3547
2021    2715
Name: year, dtype: int64

In [34]:
df1[['committedTo']] = df1[['committedTo']].fillna("Uncommitted")

In [35]:
# pd.set_option('display.max_rows',500)

df1_group = df1.groupby(['committedTo','year']).size().unstack(fill_value=0)
df1_group.head()

year,2015,2016,2017,2018,2019,2020,2021
committedTo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Abilene Christian,2,8,3,0,0,0,0
Air Force,22,40,59,26,51,33,47
Akron,14,7,20,14,22,17,16
Alabama,24,22,27,21,27,24,26
Alabama A&M,0,1,1,1,1,0,1


In [36]:
# check uncommitted recruit totals
df1_group.query('committedTo== "Uncommitted"')

year,2015,2016,2017,2018,2019,2020,2021
committedTo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Uncommitted,549,763,958,829,1042,996,224


In [37]:
df1.isnull().sum(axis = 0)

id                          0
year                        0
ranking                     3
name                        0
school                     30
committedTo                 0
position                   10
height                     46
weight                     50
stars                       0
rating                      0
city                      125
stateProvince             127
hometownInfo.latitude     204
hometownInfo.longitude    204
hometownInfo.fipsCode     213
dtype: int64

In [38]:
# drop rows containing NaN values for selected columns

df1.dropna(subset=['school','ranking', 'position','city', 'stateProvince',
                   'hometownInfo.latitude','hometownInfo.longitude','hometownInfo.fipsCode'], inplace=True)

print(df1.shape)
df1.head()

(26159, 16)


Unnamed: 0,id,year,ranking,name,school,committedTo,position,height,weight,stars,rating,city,stateProvince,hometownInfo.latitude,hometownInfo.longitude,hometownInfo.fipsCode
0,66928,2015,1.0,Trenton Thompson,Westover,Georgia,DT,74.0,313.0,5,0.9992,Albany,GA,31.578206,-84.155681,13095.0
1,31861,2015,2.0,Martez Ivey,Apopka,Florida,OT,77.5,275.0,5,0.999,Apopka,FL,28.677968,-81.511521,12095.0
2,31862,2015,3.0,Byron Cowart,Armwood,Auburn,SDE,76.0,250.0,5,0.9987,Seffner,FL,27.998541,-82.274884,12057.0
3,31863,2015,4.0,Iman Marshall,Long Beach Poly,USC,CB,73.0,190.0,5,0.9985,Long Beach,CA,33.769016,-118.191605,6037.0
4,31864,2015,5.0,Derwin James,Haines City Senior,Florida State,S,74.0,201.0,5,0.9981,Auburndale,FL,28.107088,-81.80358,12105.0


In [39]:
df_sjs1 = df1[df1['committedTo'].str.contains("San")]
df_sjs1

Unnamed: 0,id,year,ranking,name,school,committedTo,position,height,weight,stars,rating,city,stateProvince,hometownInfo.latitude,hometownInfo.longitude,hometownInfo.fipsCode
217,32075,2015,218.0,Desean Holmes,Bishop Alemany,San Diego State,WR,70.5,170.0,4,0.9127,Pasadena,CA,34.147645,-118.144478,6037.0
244,32124,2015,245.0,Kyahva Tezino,Salesian,San Diego State,OLB,72.0,207.0,4,0.9080,Los Angeles,CA,34.053691,-118.242767,6037.0
327,32202,2015,329.0,Kanya Bell,Long Beach Poly,San JosÃ© State,WR,72.0,160.0,4,0.8914,Diamond Bar,CA,34.028623,-117.810337,6037.0
352,32205,2015,354.0,Taeon Mason,John Muir,San JosÃ© State,CB,72.0,170.0,3,0.8887,Pasadena,CA,34.147645,-118.144478,6037.0
680,32537,2015,682.0,Jeremy Kelly,Salesian,San JosÃ© State,WR,73.0,169.0,3,0.8584,Los Angeles,CA,34.053691,-118.242767,6037.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26040,73778,2021,2354.0,D.J. Bryant,San Leandro,San Diego State,DUAL,72.0,170.0,3,0.8024,San Leandro,CA,37.724930,-122.156077,6001.0
26146,73870,2021,2460.0,Robert Rigsby,Judson,UT San Antonio,OC,75.0,310.0,2,0.7963,Converse,TX,29.518008,-98.316124,48029.0
26211,73925,2021,2525.0,Kekona Tinajero,Rio Mesa,San Diego,ATH,72.0,175.0,2,0.7932,Oxnard,CA,34.197631,-119.180382,6111.0
26258,73967,2021,2572.0,Ryan Stewart,Lincoln-Way East,San JosÃ© State,OT,77.0,240.0,2,0.7889,Frankfort,IL,41.495866,-87.848661,17197.0


In [40]:
df1.loc[df1.committedTo.str.contains('San J', na=False), 'committedTo'] = 'San José State'

df1.loc[df1.committedTo.str.contains('San J', na=False)].head()

Unnamed: 0,id,year,ranking,name,school,committedTo,position,height,weight,stars,rating,city,stateProvince,hometownInfo.latitude,hometownInfo.longitude,hometownInfo.fipsCode
327,32202,2015,329.0,Kanya Bell,Long Beach Poly,San José State,WR,72.0,160.0,4,0.8914,Diamond Bar,CA,34.028623,-117.810337,6037.0
352,32205,2015,354.0,Taeon Mason,John Muir,San José State,CB,72.0,170.0,3,0.8887,Pasadena,CA,34.147645,-118.144478,6037.0
680,32537,2015,682.0,Jeremy Kelly,Salesian,San José State,WR,73.0,169.0,3,0.8584,Los Angeles,CA,34.053691,-118.242767,6037.0
886,32742,2015,888.0,Malik Roberson,Junipero Serra,San José State,APB,67.0,170.0,3,0.8497,Gardena,CA,33.896359,-118.305304,6037.0
910,32767,2015,912.0,Dominic Fredrickson,Freedom,San José State,OG,75.0,295.0,3,0.8487,Oakley,CA,37.997422,-121.712454,6013.0


In [41]:
df1.isnull().sum(axis = 0)

id                         0
year                       0
ranking                    0
name                       0
school                     0
committedTo                0
position                   0
height                    43
weight                    46
stars                      0
rating                     0
city                       0
stateProvince              0
hometownInfo.latitude      0
hometownInfo.longitude     0
hometownInfo.fipsCode      0
dtype: int64

In [42]:
# check recruits who do not have height listed
df1[df1['height'].isna()].head()

Unnamed: 0,id,year,ranking,name,school,committedTo,position,height,weight,stars,rating,city,stateProvince,hometownInfo.latitude,hometownInfo.longitude,hometownInfo.fipsCode
1328,66964,2015,1331.0,Ronique Robinson,Booker T. Washington,Uncommitted,WDE,,,3,0.8333,Miami,FL,25.774266,-80.193659,12086.0
2962,67114,2015,2969.0,Kiki Hill,Goliad,Uncommitted,WR,,,2,0.7667,Goliad,TX,28.668656,-97.390912,48175.0
5059,36638,2016,1513.0,Kailen Guillory,Ben Davis,Western Michigan,WDE,,,3,0.824,Indianapolis,IN,39.768333,-86.15835,18097.0
5535,37099,2016,1989.0,Taeshon Trotter,Cass Technical,Ohio,OT,,,3,0.8039,Detroit,MI,42.331551,-83.04664,26163.0
5540,37104,2016,1994.0,Kee Whetzel,Countryside,Oregon State,WR,,186.0,3,0.8036,Clearwater,FL,27.965853,-82.800103,12103.0


In [43]:
# impute mean height by position to missing height values
df1.height = df1.groupby('position')['height'].apply(lambda x: x.fillna(x.mean()))
df1.height = df1.height.fillna(df1.height.mean())

In [44]:
# impute mean weight by position to missing weight values
df1.weight = df1.groupby('position')['weight'].apply(lambda x: x.fillna(x.mean()))
df1.weight = df1.weight.fillna(df1.weight.mean())

In [45]:
# check for one of the players without height/weight listed 

df1[df1['name'] == "Kiki Hill"]

Unnamed: 0,id,year,ranking,name,school,committedTo,position,height,weight,stars,rating,city,stateProvince,hometownInfo.latitude,hometownInfo.longitude,hometownInfo.fipsCode
2962,67114,2015,2969.0,Kiki Hill,Goliad,Uncommitted,WR,72.887123,182.179795,2,0.7667,Goliad,TX,28.668656,-97.390912,48175.0


In [46]:
# ensure all rows have values

df1.isnull().sum(axis = 0)

id                        0
year                      0
ranking                   0
name                      0
school                    0
committedTo               0
position                  0
height                    0
weight                    0
stars                     0
rating                    0
city                      0
stateProvince             0
hometownInfo.latitude     0
hometownInfo.longitude    0
hometownInfo.fipsCode     0
dtype: int64

## 2. Add County names based on FIPS code

In [47]:
# load new dataframe with fips codes and county names

url ='https://www2.census.gov/geo/docs/reference/codes/files/national_county.txt'
df_5 = pd.read_csv(url, header=None, sep=",",names=['state','fips1','fips2','county','E'],dtype=str)
df_5.head()

Unnamed: 0,state,fips1,fips2,county,E
0,AL,1,1,Autauga County,H1
1,AL,1,3,Baldwin County,H1
2,AL,1,5,Barbour County,H1
3,AL,1,7,Bibb County,H1
4,AL,1,9,Blount County,H1


In [48]:
df_5['hometownInfo.fipsCode'] = df_5['fips1'] + df_5['fips2']
df_5.head()

Unnamed: 0,state,fips1,fips2,county,E,hometownInfo.fipsCode
0,AL,1,1,Autauga County,H1,1001
1,AL,1,3,Baldwin County,H1,1003
2,AL,1,5,Barbour County,H1,1005
3,AL,1,7,Bibb County,H1,1007
4,AL,1,9,Blount County,H1,1009


In [49]:
df6 = df_5[['hometownInfo.fipsCode','county']].copy()
df6['hometownInfo.fipsCode'] = pd.to_numeric(df6['hometownInfo.fipsCode'])
df6.head()

Unnamed: 0,hometownInfo.fipsCode,county
0,1001,Autauga County
1,1003,Baldwin County
2,1005,Barbour County
3,1007,Bibb County
4,1009,Blount County


In [50]:
# merge county names onto existing dataframe

df2 = df1.merge(df6, on='hometownInfo.fipsCode', how='left')
df2.head(10)

Unnamed: 0,id,year,ranking,name,school,committedTo,position,height,weight,stars,rating,city,stateProvince,hometownInfo.latitude,hometownInfo.longitude,hometownInfo.fipsCode,county
0,66928,2015,1.0,Trenton Thompson,Westover,Georgia,DT,74.0,313.0,5,0.9992,Albany,GA,31.578206,-84.155681,13095.0,Dougherty County
1,31861,2015,2.0,Martez Ivey,Apopka,Florida,OT,77.5,275.0,5,0.999,Apopka,FL,28.677968,-81.511521,12095.0,Orange County
2,31862,2015,3.0,Byron Cowart,Armwood,Auburn,SDE,76.0,250.0,5,0.9987,Seffner,FL,27.998541,-82.274884,12057.0,Hillsborough County
3,31863,2015,4.0,Iman Marshall,Long Beach Poly,USC,CB,73.0,190.0,5,0.9985,Long Beach,CA,33.769016,-118.191605,6037.0,Los Angeles County
4,31864,2015,5.0,Derwin James,Haines City Senior,Florida State,S,74.0,201.0,5,0.9981,Auburndale,FL,28.107088,-81.80358,12105.0,Polk County
5,31865,2015,6.0,Kahlil McKenzie,Clayton Valley,Tennessee,DT,75.0,339.0,5,0.9968,Concord,CA,37.976852,-122.033562,6013.0,Contra Costa County
6,31866,2015,7.0,CeCe Jefferson,Baker County Senior,Florida,SDE,74.0,275.0,5,0.9951,Glen Saint Mary,FL,30.275791,-82.160669,12003.0,Baker County
7,31867,2015,8.0,Josh Sweat,Oscar Smith,Florida State,WDE,77.0,240.0,5,0.9948,Chesapeake,VA,36.718371,-76.24668,51550.0,Chesapeake city
8,31868,2015,9.0,Kevin Toliver II,Trinity Christian Academy,LSU,CB,74.0,185.0,5,0.9948,Jacksonville,FL,30.332184,-81.655651,12031.0,Duval County
9,31869,2015,10.0,Malik Jefferson,Poteet,Texas,OLB,74.5,215.0,5,0.9929,Mesquite,TX,32.76661,-96.599472,48113.0,Dallas County


In [51]:
df2.isnull().sum(axis = 0)

id                        0
year                      0
ranking                   0
name                      0
school                    0
committedTo               0
position                  0
height                    0
weight                    0
stars                     0
rating                    0
city                      0
stateProvince             0
hometownInfo.latitude     0
hometownInfo.longitude    0
hometownInfo.fipsCode     0
county                    0
dtype: int64

In [52]:
# rename columns for more accurate/clean headers

df2.rename({'school': 'high_school', 'committedTo': 'college_commit', 'stateProvince': 'state',
           'hometownInfo.latitude': 'hometown_lat', 'hometownInfo.longitude': 'hometown_long', 
            'hometownInfo.fipsCode': 'hometown_fips'}, axis=1, inplace=True)

print(df2.shape)
df2.head()

(26159, 17)


Unnamed: 0,id,year,ranking,name,high_school,college_commit,position,height,weight,stars,rating,city,state,hometown_lat,hometown_long,hometown_fips,county
0,66928,2015,1.0,Trenton Thompson,Westover,Georgia,DT,74.0,313.0,5,0.9992,Albany,GA,31.578206,-84.155681,13095.0,Dougherty County
1,31861,2015,2.0,Martez Ivey,Apopka,Florida,OT,77.5,275.0,5,0.999,Apopka,FL,28.677968,-81.511521,12095.0,Orange County
2,31862,2015,3.0,Byron Cowart,Armwood,Auburn,SDE,76.0,250.0,5,0.9987,Seffner,FL,27.998541,-82.274884,12057.0,Hillsborough County
3,31863,2015,4.0,Iman Marshall,Long Beach Poly,USC,CB,73.0,190.0,5,0.9985,Long Beach,CA,33.769016,-118.191605,6037.0,Los Angeles County
4,31864,2015,5.0,Derwin James,Haines City Senior,Florida State,S,74.0,201.0,5,0.9981,Auburndale,FL,28.107088,-81.80358,12105.0,Polk County


In [53]:
# check data types 

df2.dtypes

id                  int64
year                int64
ranking           float64
name               object
high_school        object
college_commit     object
position           object
height            float64
weight            float64
stars               int64
rating            float64
city               object
state              object
hometown_lat      float64
hometown_long     float64
hometown_fips     float64
county             object
dtype: object

In [54]:
# merge position data onto existing dateframe

df3 = df2.merge(df_pos, on='position', how='left')
print(df3.shape)
df3.head(10)

(26159, 20)


Unnamed: 0,id,year,ranking,name,high_school,college_commit,position,height,weight,stars,rating,city,state,hometown_lat,hometown_long,hometown_fips,county,position_name,position_group,type
0,66928,2015,1.0,Trenton Thompson,Westover,Georgia,DT,74.0,313.0,5,0.9992,Albany,GA,31.578206,-84.155681,13095.0,Dougherty County,Defensive Tackle,Defensive Line,Defense
1,31861,2015,2.0,Martez Ivey,Apopka,Florida,OT,77.5,275.0,5,0.999,Apopka,FL,28.677968,-81.511521,12095.0,Orange County,Offensive Tackle,Offensive Line,Offense
2,31862,2015,3.0,Byron Cowart,Armwood,Auburn,SDE,76.0,250.0,5,0.9987,Seffner,FL,27.998541,-82.274884,12057.0,Hillsborough County,Strong-Side Defensive End,Defensive Line,Defense
3,31863,2015,4.0,Iman Marshall,Long Beach Poly,USC,CB,73.0,190.0,5,0.9985,Long Beach,CA,33.769016,-118.191605,6037.0,Los Angeles County,Cornerback,Defensive Back,Defense
4,31864,2015,5.0,Derwin James,Haines City Senior,Florida State,S,74.0,201.0,5,0.9981,Auburndale,FL,28.107088,-81.80358,12105.0,Polk County,Safety,Defensive Back,Defense
5,31865,2015,6.0,Kahlil McKenzie,Clayton Valley,Tennessee,DT,75.0,339.0,5,0.9968,Concord,CA,37.976852,-122.033562,6013.0,Contra Costa County,Defensive Tackle,Defensive Line,Defense
6,31866,2015,7.0,CeCe Jefferson,Baker County Senior,Florida,SDE,74.0,275.0,5,0.9951,Glen Saint Mary,FL,30.275791,-82.160669,12003.0,Baker County,Strong-Side Defensive End,Defensive Line,Defense
7,31867,2015,8.0,Josh Sweat,Oscar Smith,Florida State,WDE,77.0,240.0,5,0.9948,Chesapeake,VA,36.718371,-76.24668,51550.0,Chesapeake city,Weak-Side Defensive End,Defensive Line,Defense
8,31868,2015,9.0,Kevin Toliver II,Trinity Christian Academy,LSU,CB,74.0,185.0,5,0.9948,Jacksonville,FL,30.332184,-81.655651,12031.0,Duval County,Cornerback,Defensive Back,Defense
9,31869,2015,10.0,Malik Jefferson,Poteet,Texas,OLB,74.5,215.0,5,0.9929,Mesquite,TX,32.76661,-96.599472,48113.0,Dallas County,Outside Linebacker,Linebacker,Defense


In [55]:
df4 = df3.merge(df_team1, on='college_commit', how='left')
print(df4.shape)
df4.head(10)

(26159, 26)


Unnamed: 0,id,year,ranking,name,high_school,college_commit,position,height,weight,stars,...,county,position_name,position_group,type,conference,division,college_city,college_state,college_lat,college_long
0,66928,2015,1.0,Trenton Thompson,Westover,Georgia,DT,74.0,313.0,5,...,Dougherty County,Defensive Tackle,Defensive Line,Defense,SEC,East,Athens,GA,33.94982,-83.373381
1,31861,2015,2.0,Martez Ivey,Apopka,Florida,OT,77.5,275.0,5,...,Orange County,Offensive Tackle,Offensive Line,Offense,SEC,East,Gainesville,FL,29.649936,-82.348579
2,31862,2015,3.0,Byron Cowart,Armwood,Auburn,SDE,76.0,250.0,5,...,Hillsborough County,Strong-Side Defensive End,Defensive Line,Defense,SEC,West,Auburn,AL,32.602553,-85.489748
3,31863,2015,4.0,Iman Marshall,Long Beach Poly,USC,CB,73.0,190.0,5,...,Los Angeles County,Cornerback,Defensive Back,Defense,Pac-12,South,Los Angeles,CA,34.014167,-118.287778
4,31864,2015,5.0,Derwin James,Haines City Senior,Florida State,S,74.0,201.0,5,...,Polk County,Safety,Defensive Back,Defense,ACC,Atlantic,Tallahassee,FL,30.438169,-84.304403
5,31865,2015,6.0,Kahlil McKenzie,Clayton Valley,Tennessee,DT,75.0,339.0,5,...,Contra Costa County,Defensive Tackle,Defensive Line,Defense,SEC,East,Knoxville,TN,35.955013,-83.925013
6,31866,2015,7.0,CeCe Jefferson,Baker County Senior,Florida,SDE,74.0,275.0,5,...,Baker County,Strong-Side Defensive End,Defensive Line,Defense,SEC,East,Gainesville,FL,29.649936,-82.348579
7,31867,2015,8.0,Josh Sweat,Oscar Smith,Florida State,WDE,77.0,240.0,5,...,Chesapeake city,Weak-Side Defensive End,Defensive Line,Defense,ACC,Atlantic,Tallahassee,FL,30.438169,-84.304403
8,31868,2015,9.0,Kevin Toliver II,Trinity Christian Academy,LSU,CB,74.0,185.0,5,...,Duval County,Cornerback,Defensive Back,Defense,SEC,West,Baton Rouge,LA,30.412035,-91.183816
9,31869,2015,10.0,Malik Jefferson,Poteet,Texas,OLB,74.5,215.0,5,...,Dallas County,Outside Linebacker,Linebacker,Defense,Big 12,fcs,Austin,TX,30.283681,-97.732534


In [56]:
df4.isnull().sum(axis = 0)

id                   0
year                 0
ranking              0
name                 0
high_school          0
college_commit       0
position             0
height               0
weight               0
stars                0
rating               0
city                 0
state                0
hometown_lat         0
hometown_long        0
hometown_fips        0
county               0
position_name        0
position_group       0
type                 0
conference        5326
division          5326
college_city      5326
college_state     5326
college_lat       5326
college_long      5326
dtype: int64

In [57]:
# Fill 'uncommitted' as conference value for uncommitted recruits

df4[['conference']] = df4[['conference']].fillna("None")

In [58]:
# Fill in hometown location data for uncommitted recruits

df4.loc[df4["college_city"].isnull(),'college_city'] = df4["city"]
df4.loc[df4["college_state"].isnull(),'college_state'] = df4["state"]
df4.loc[df4["college_lat"].isnull(),'college_lat'] = df4["hometown_lat"]
df4.loc[df4["college_long"].isnull(),'college_long'] = df4["hometown_long"]

In [59]:
print(df4.shape)
df4.head(10)

(26159, 26)


Unnamed: 0,id,year,ranking,name,high_school,college_commit,position,height,weight,stars,...,county,position_name,position_group,type,conference,division,college_city,college_state,college_lat,college_long
0,66928,2015,1.0,Trenton Thompson,Westover,Georgia,DT,74.0,313.0,5,...,Dougherty County,Defensive Tackle,Defensive Line,Defense,SEC,East,Athens,GA,33.94982,-83.373381
1,31861,2015,2.0,Martez Ivey,Apopka,Florida,OT,77.5,275.0,5,...,Orange County,Offensive Tackle,Offensive Line,Offense,SEC,East,Gainesville,FL,29.649936,-82.348579
2,31862,2015,3.0,Byron Cowart,Armwood,Auburn,SDE,76.0,250.0,5,...,Hillsborough County,Strong-Side Defensive End,Defensive Line,Defense,SEC,West,Auburn,AL,32.602553,-85.489748
3,31863,2015,4.0,Iman Marshall,Long Beach Poly,USC,CB,73.0,190.0,5,...,Los Angeles County,Cornerback,Defensive Back,Defense,Pac-12,South,Los Angeles,CA,34.014167,-118.287778
4,31864,2015,5.0,Derwin James,Haines City Senior,Florida State,S,74.0,201.0,5,...,Polk County,Safety,Defensive Back,Defense,ACC,Atlantic,Tallahassee,FL,30.438169,-84.304403
5,31865,2015,6.0,Kahlil McKenzie,Clayton Valley,Tennessee,DT,75.0,339.0,5,...,Contra Costa County,Defensive Tackle,Defensive Line,Defense,SEC,East,Knoxville,TN,35.955013,-83.925013
6,31866,2015,7.0,CeCe Jefferson,Baker County Senior,Florida,SDE,74.0,275.0,5,...,Baker County,Strong-Side Defensive End,Defensive Line,Defense,SEC,East,Gainesville,FL,29.649936,-82.348579
7,31867,2015,8.0,Josh Sweat,Oscar Smith,Florida State,WDE,77.0,240.0,5,...,Chesapeake city,Weak-Side Defensive End,Defensive Line,Defense,ACC,Atlantic,Tallahassee,FL,30.438169,-84.304403
8,31868,2015,9.0,Kevin Toliver II,Trinity Christian Academy,LSU,CB,74.0,185.0,5,...,Duval County,Cornerback,Defensive Back,Defense,SEC,West,Baton Rouge,LA,30.412035,-91.183816
9,31869,2015,10.0,Malik Jefferson,Poteet,Texas,OLB,74.5,215.0,5,...,Dallas County,Outside Linebacker,Linebacker,Defense,Big 12,fcs,Austin,TX,30.283681,-97.732534


In [60]:
df4.isnull().sum(axis = 0)

id                   0
year                 0
ranking              0
name                 0
high_school          0
college_commit       0
position             0
height               0
weight               0
stars                0
rating               0
city                 0
state                0
hometown_lat         0
hometown_long        0
hometown_fips        0
county               0
position_name        0
position_group       0
type                 0
conference           0
division          5326
college_city         0
college_state        0
college_lat          0
college_long         0
dtype: int64

## 3. Export to CSV to check

In [61]:
# check csv file for errors

df4.to_csv('2015_2021_Recruits_V6.csv',encoding='utf-8-sig')

## 4. Node Prep: Final Dataframes after Cleaning

In [62]:
# Players Node

player_df = df4[['id','year','name','height','weight']]
print(player_df.shape)
player_df.head()

(26159, 5)


Unnamed: 0,id,year,name,height,weight
0,66928,2015,Trenton Thompson,74.0,313.0
1,31861,2015,Martez Ivey,77.5,275.0
2,31862,2015,Byron Cowart,76.0,250.0
3,31863,2015,Iman Marshall,73.0,190.0
4,31864,2015,Derwin James,74.0,201.0


In [63]:
# Ratings Node

rating_df = df4[['id','year','ranking','stars','rating']]
print(rating_df.shape)
rating_df.head()

(26159, 5)


Unnamed: 0,id,year,ranking,stars,rating
0,66928,2015,1.0,5,0.9992
1,31861,2015,2.0,5,0.999
2,31862,2015,3.0,5,0.9987
3,31863,2015,4.0,5,0.9985
4,31864,2015,5.0,5,0.9981


In [64]:
# Hometown Node

hometown_df = df4[['id','city','hometown_fips','county','state','hometown_lat','hometown_long']]
print(hometown_df.shape)
hometown_df.head()

(26159, 7)


Unnamed: 0,id,city,hometown_fips,county,state,hometown_lat,hometown_long
0,66928,Albany,13095.0,Dougherty County,GA,31.578206,-84.155681
1,31861,Apopka,12095.0,Orange County,FL,28.677968,-81.511521
2,31862,Seffner,12057.0,Hillsborough County,FL,27.998541,-82.274884
3,31863,Long Beach,6037.0,Los Angeles County,CA,33.769016,-118.191605
4,31864,Auburndale,12105.0,Polk County,FL,28.107088,-81.80358


In [65]:
# Position Node

position_df = df4[['id','position','position_name','position_group','type']]
print(position_df.shape)
position_df.head()

(26159, 5)


Unnamed: 0,id,position,position_name,position_group,type
0,66928,DT,Defensive Tackle,Defensive Line,Defense
1,31861,OT,Offensive Tackle,Offensive Line,Offense
2,31862,SDE,Strong-Side Defensive End,Defensive Line,Defense
3,31863,CB,Cornerback,Defensive Back,Defense
4,31864,S,Safety,Defensive Back,Defense


In [66]:
# School Node

school_df = df4[['id','high_school']]
print(school_df.shape)
school_df.head()

(26159, 2)


Unnamed: 0,id,high_school
0,66928,Westover
1,31861,Apopka
2,31862,Armwood
3,31863,Long Beach Poly
4,31864,Haines City Senior


In [67]:
# College Node

college_df = df4[['id','college_commit','conference','college_state','college_lat','college_long']]
print(college_df.shape)
college_df.head()

(26159, 6)


Unnamed: 0,id,college_commit,conference,college_state,college_lat,college_long
0,66928,Georgia,SEC,GA,33.94982,-83.373381
1,31861,Florida,SEC,FL,29.649936,-82.348579
2,31862,Auburn,SEC,AL,32.602553,-85.489748
3,31863,USC,Pac-12,CA,34.014167,-118.287778
4,31864,Florida State,ACC,FL,30.438169,-84.304403


In [68]:
# export dataframes 

player_df.to_csv('data/player.csv', index=False)
rating_df.to_csv('data/rating.csv', index=False)
hometown_df.to_csv('data/hometown.csv', index=False)
position_df.to_csv('data/positionV2.csv', index=False)
school_df.to_csv('data/school.csv', index=False)
college_df.to_csv('data/collegeV2.csv', index=False)

## 5. Cypher Import Script

In [69]:
CREATE CONSTRAINT ON (n:Player) ASSERT n.id is UNIQUE;
CREATE CONSTRAINT ON (n:Rating) ASSERT n.id is UNIQUE; 
CREATE CONSTRAINT ON (n:Hometown) ASSERT n.id is UNIQUE; 
CREATE CONSTRAINT ON (n:Position) ASSERT n.position is UNIQUE; 
CREATE CONSTRAINT ON (n:School) ASSERT n.high_school is UNIQUE; 
CREATE CONSTRAINT ON (n:College) ASSERT n.college_commit is UNIQUE;

LOAD CSV WITH HEADERS FROM 'file:///player.csv' AS row
CREATE(:Player
{
    `id`: row.`id`,
    `year`: row.`year`,
    `name`: row.`name`,
    `height`: row.`height`,
    `weight`: row.`weight` 
});

LOAD CSV WITH HEADERS FROM 'file:///rating.csv' AS row
MERGE (n:Rating {year: row.year,
    ranking: row.ranking,
    stars: row.stars,
    rating: row.rating}) 
WITH row, n 
MATCH(p:Player {id: row.id}) 
MERGE (p)-[:HAS_RATING]->(n)

LOAD CSV WITH HEADERS FROM 'file:///hometown.csv' AS row
MERGE (n:Hometown {city: row.city,
    hometown_fips: row.hometown_fips,
    county: row.county,
    state: row.state,
    hometown_lat: row.hometown_lat,
    hometown_long: row.hometown_long}) 
WITH row, n 
MATCH(p:Player {id: row.id}) 
MERGE (p)<-[:IS_FROM]-(n)

LOAD CSV WITH HEADERS FROM 'file:///college.csv' AS row
MERGE (n:College {name: row.college_commit}) 
WITH row, n 
MATCH(p:Player {id: row.id}) 
MERGE (p)<-[:COMMITTED_TO]-(n)

LOAD CSV WITH HEADERS FROM 'file:///school.csv' AS row
MERGE (n:School {name: row.school}) 
WITH row, n 
MATCH(p:Player {id: row.id}) 
MERGE (p)<-[:PLAYED_AT]-(n)

LOAD CSV WITH HEADERS FROM 'file:///position.csv' AS row 
MERGE (n:Position {name: row.position}) 
WITH row, n 
MATCH(p:Player {id: row.id}) 
MERGE (p)-[:HAS_POSITION]->(n);

SyntaxError: invalid syntax (<ipython-input-69-06b834418779>, line 1)

In [None]:
//Create Players
LOAD CSV WITH HEADERS FROM 'file:///player.csv' AS row
CREATE(:Player
{
    `id`: row.`id`,
    `year`: row.`year`,
    `name`: row.`name`,
    `height`: row.`height`,
    `weight`: row.`weight` 
});

In [None]:
//Creating Rating
LOAD CSV WITH HEADERS FROM 'file:///rating.csv' AS row
MERGE (n:Rating {year: row.year,
    ranking: row.ranking,
    stars: row.stars,
    rating: row.rating}) 
WITH row, n 
MATCH(p:Player {id: row.id}) 
MERGE (p)-[:HAS_RATING]->(n)

In [None]:
//Create Hometown
LOAD CSV WITH HEADERS FROM 'file:///hometown.csv' AS row
MERGE (n:Hometown {name: row.city,
    hometown_fips: row.hometown_fips,
    county: row.county,
    state: row.state,
    hometown_lat: row.hometown_lat,
    hometown_long: row.hometown_long}) 
WITH row, n 
MATCH(p:Player {id: row.id}) 
MERGE (p)<-[:IS_FROM]-(n)

In [None]:
//Create College 
LOAD CSV WITH HEADERS FROM 'file:///collegeV2.csv' AS row
MERGE (n:College {name: row.college_commit,
                 conference: row.conference,
                 division: row.division,
                 college_state: row.college_state,
                 college_lat: row.college_lat,
                 college_long: row.college_long}) 
WITH row, n 
MATCH(p:Player {id: row.id}) 
MERGE (p)<-[:COMMITTED_TO]-(n)

In [None]:
//Create School
LOAD CSV WITH HEADERS FROM 'file:///school.csv' AS row
MERGE (n:School {name: row.school}) 
WITH row, n 
MATCH(p:Player {id: row.id}) 
MERGE (p)<-[:PLAYED_AT]-(n)

In [None]:
//Create Position
LOAD CSV WITH HEADERS FROM 'file:///positionV2.csv' AS row 
MERGE (n:Position {position: row.position,
                  position_name: row.position_name,
                  position_group: row.position_group,
                  type: row.type}) 
WITH row, n 
MATCH(p:Player {id: row.id}) 
MERGE (p)-[:HAS_POSITION]->(n);

In [None]:
MATCH (n:Hometown{city:'Charlotte'})
CALL gds.alpha.spanningTree.minimum.write({
  startNodeId: id(n),
  nodeProjection: 'Hometown',
  relationshipProjection: {
    COST: {
      type: 'COST',
      properties: 'distance',
      orientation: 'UNDIRECTED'
    }
  },
  relationshipWeightProperty: 'distance',
  writeProperty: 'MINST',
  weightWriteProperty: 'cost'
})
YIELD createMillis, computeMillis, writeMillis, effectiveNodeCount
RETURN createMillis, computeMillis, writeMillis, effectiveNodeCount;

In [None]:
MATCH path = (t:Rating)-[:TARGET_LOCATED]->(n:Hometown {city: 'Charlotte'})-[:MINST*]-()
WHERE t.year = '2021' AND ('0.78' < t.rating < '0.84')
WITH relationships(path) AS rels
UNWIND rels AS rel
WITH DISTINCT rel AS rel
WHERE rel.cost IS NOT NULL
RETURN startNode(rel).city AS source, endNode(rel).city AS destination, rel.cost AS cost

In [None]:
MATCH (c:College)-[:COMMITTED_TO]->(n:Player)
WITH n, c 
MATCH (c:College)-[:COMMITTED_TO]->(n)<-[:IS_FROM]-(h:Hometown)
CREATE (c)-[r:HAS_PIPELINE {name: c.name}]->(h)
RETURN type(r), r.name

In [None]:
MATCH (c:College)-[:COMMITTED_TO]->(n:Player)
WITH n, c 
MATCH (c:College)-[:COMMITTED_TO]->(n)<-[:IS_FROM]-(h:Hometown)
WITH count(n) as e, c, h
CREATE (c)-[r:HAS_PIPELINE]->(h)
SET r.frequency = e
RETURN type(r), r.frequency

In [None]:
MATCH (c1:College)-[r:HAS_PIPELINE]->(h:Hometown)<-[:HAS_PIPELINE]-(c2:College)
WITH c1, c2, count(r) as Cnt
WHERE c1.name <> 'Uncommitted' AND c2.name <> 'Uncommitted'
CREATE (c1)-[r:SHARES_TERRITORY]->(c2)
SET r.city_count = Cnt
RETURN type(r), r.city_count