# Scraping/pulling to photo source URL

First, I'm looking at the old scrape for tips and to identify any url patterns.

In [26]:
import pandas as pd
import re

In [27]:
# data = pd.read_csv('attribute_table.csv')
data = pd.read_json('sync.p.json')
data.rename(columns={'Photo_source': 'photo_src'}, inplace=True)

In [28]:
# making sure there arent repeats to identify a true pattern
checking_for_duplicate_srcs = list(data.duplicated('photo_src'))
if True in checking_for_duplicate_srcs:
    data.drop_duplicates(subset = ['photo_src'], inplace= True)
else:
    pass


In [29]:
pd.set_option('display.max_colwidth', None)
subset_data = data[['photo_src', 'URL', 'PhotoID']].sort_values('photo_src')
subset_data

Unnamed: 0,photo_src,URL,PhotoID
64,https://live.staticflickr.com/65535/49742499418_fbdbb78e78_o.jpg,https://www.flickr.com/photos/fractracker/49742499418/in/album-72157715839176878,49742499418
74,https://live.staticflickr.com/65535/49743045891_31d576a54d_o.jpg,https://www.flickr.com/photos/fractracker/49743045891/in/album-72157715839176878,49743045891
66,https://live.staticflickr.com/65535/49743046746_45d25cf8c4_o.jpg,https://www.flickr.com/photos/fractracker/49743046746/in/album-72157715839176878,49743046746
179,https://live.staticflickr.com/65535/49743350002_5993070b9f_o.jpg,https://www.flickr.com/photos/fractracker/49743350002/in/album-72157713808860567,49743350002
65,https://live.staticflickr.com/65535/49743369437_eea0df6137_o.jpg,https://www.flickr.com/photos/fractracker/49743369437/in/album-72157715839176878,49743369437
...,...,...,...
193,https://live.staticflickr.com/65535/51323278290_535768ca72_o.jpg,https://www.flickr.com/photos/fractracker/51323278290/in/album-72157713808860567,51323278290
198,https://live.staticflickr.com/65535/51388234837_94ba140c4e_o.jpg,https://www.flickr.com/photos/fractracker/51388234837/in/album-72157713808860567,51388234837
200,https://live.staticflickr.com/65535/51389248318_5b2071e7cd_o.jpg,https://www.flickr.com/photos/fractracker/51389248318/in/album-72157713808860567,51389248318
199,https://live.staticflickr.com/65535/51389248718_60595d0525_o.jpg,https://www.flickr.com/photos/fractracker/51389248718/in/album-72157713808860567,51389248718


## Checker:

## 1.

In [30]:
# it looks like the beginning part of src is the same lests see
def get_beginning_str_src(data):
    return data[0:len('https://live.staticflickr.com/')]

In [31]:
# if the list is longer that 1, there are different beginnings to the string. If it =1, all the strings start the same
len(subset_data['photo_src'].apply(get_beginning_str_src).unique())

1

In [32]:
print('The first', len('https://live.staticflickr.com/'), 'characters of `photo_src` are always https://live.staticflickr.com/' )

The first 30 characters of `photo_src` are always https://live.staticflickr.com/


## 2.

In [33]:
# it looks like 5 digits always follow the beginning https://live.staticflickr.com/
# to check this, i will grab all the characters between the third and fourth "/", check that there are 5 characters, and check that it is an integer
def get_text_between_third_and_fourth_slash(url):
    # Split the string by "/"
    parts = url.split('/')
    # Check if there are enough parts
    if len(parts) > 4:
        # Return the part between the third and fourth slash
        return parts[3]
    else:
        return "NaN"

In [34]:
five_digit_check = pd.DataFrame(subset_data.photo_src.apply(get_text_between_third_and_fourth_slash))

In [35]:
# if it is false, all of the values were successfully grabbed
five_digit_check.isna().value_counts()

photo_src
False        199
dtype: int64

In [36]:
print('it is originally an', five_digit_check.dtypes[0])
print('and we can successfully convert it to', five_digit_check.astype('int').dtypes[0], 'meaning that they are all integers')

it is originally an object
and we can successfully convert it to int32 meaning that they are all integers


In [37]:
print('number of digits | Percent of data')
five_digit_check.photo_src.apply(lambda x: len(x)).value_counts()/subset_data.shape[0]*100

number of digits | Percent of data


5    100.0
Name: photo_src, dtype: float64

## 3.

In [38]:
# it looks like the middle part of photo src is identical to he middle part of url and PhotoID, lets see
def get_middle_str_src(data):
    return data[36:47]

def get_middle_str_url(data):
    return data[42:53]

In [39]:
# running the functions and adding it to a dataframe
middle_chars = pd.DataFrame({'src_middle': subset_data['photo_src'].apply(get_middle_str_src), 'URL_middle': subset_data['URL'].apply(get_middle_str_url), 'PhotoID': subset_data['PhotoID'].astype('str')})

In [40]:
middle_chars

Unnamed: 0,src_middle,URL_middle,PhotoID
64,49742499418,49742499418,49742499418
74,49743045891,49743045891,49743045891
66,49743046746,49743046746,49743046746
179,49743350002,49743350002,49743350002
65,49743369437,49743369437,49743369437
...,...,...,...
193,51323278290,51323278290,51323278290
198,51388234837,51388234837,51388234837
200,51389248318,51389248318,51389248318
199,51389248718,51389248718,51389248718


In [41]:
# comparing the values. if all entries return true, then these sections are identical across the photo_src and URL
pd.Series(middle_chars['src_middle'] == middle_chars['URL_middle']).value_counts()

True    199
dtype: int64

In [42]:
pd.Series(middle_chars['src_middle'] == middle_chars['PhotoID']).value_counts()

True    199
dtype: int64

In [43]:
print('Character number', len('https://live.staticflickr.com/31337'), 'through character number', len('https://live.staticflickr.com/31337/50199457303'), 
      'of `photo_src` match character number', len('https://www.flickr.com/photos/fractracker'), 'through character number', 
      len('https://www.flickr.com/photos/fractracker/50199457303'), 'of `URL` and the `PhotoID`.')

Character number 35 through character number 47 of `photo_src` match character number 41 through character number 53 of `URL` and the `PhotoID`.


## 4.

In [44]:
# finding the length of the digits in the second middle squence
len('https://live.staticflickr.com/31337/50199457303_12742db461_o.jpg') - len('https://live.staticflickr.com/31337/50199457303_') - len('_o.jpg')

10

In [45]:
# it looks like every photo source url ends in this sequence: _ + 10 characters + _o. + 3 characters
def check_end_pattern(url):
    # Regular expression to match the pattern
    pattern = r'_[A-Za-z0-9]{10}_o\.[a-z]{3}$'

    # Search for the pattern in the string
    match = re.search(pattern, url)
    
    # Return True if the pattern is found, otherwise False
    return bool(match)

In [46]:
# if they all follow the pattern, they should all be true
subset_data.photo_src.apply(check_end_pattern).value_counts()

True    199
Name: photo_src, dtype: int64