# Introduction

The Lite version of the Unsplash dataset (an open-source image dataset) is used for this model. The Lite dataset includes 25,000 images, 25,000 keywords and 1,000,000 searches.

This notebook implements the prediction model for nation estimation and the photo recommendations. Some information is taken from Weijie Chen's a2, "Unsplash dataset exploration". 

# Dataset loading

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import glob
from collections import Counter

In [None]:
# Load every Unsplash table into `datasets`, keyed by table name.
# Each table may be split over several files (<name>.tsv*); the parts are
# concatenated into one DataFrame with a fresh 0..n-1 index.
path = './'
documents = ['photos', 'keywords', 'collections', 'conversions', 'colors']
datasets = {}
for doc in documents:
    # glob returns matches in arbitrary order; sorting keeps the row order of
    # the concatenated table reproducible between runs and machines.
    files = sorted(glob.glob(path + doc + ".tsv*"))
    if not files:
        # Without this check pd.concat fails with an unhelpful
        # "No objects to concatenate" error.
        raise FileNotFoundError("No files matching '" + doc + ".tsv*' found in '" + path + "'")
    subsets = [pd.read_csv(filename, sep='\t', header=0) for filename in files]

    datasets[doc] = pd.concat(subsets, axis=0, ignore_index=True)

# Data sorting

## Color

Pick the top six colors for the prediction model

In [None]:
# Tally the values found in column 5 of the colors table and keep the ten
# most frequent ones (presumably the colour-name column -- confirm against
# the colors.tsv header).
color_np = datasets['colors'].to_numpy()
color_name = color_np[:, 5]
_color_tally = Counter()
_color_tally.update(color_name.flat)
top_colors_file = _color_tally.most_common(10)
top_colors_file

[('darkslategray', 36493),
 ('dimgray', 20776),
 ('darkolivegreen', 17740),
 ('black', 16149),
 ('gray', 14245),
 ('darkgray', 12816),
 ('silver', 12558),
 ('rosybrown', 11644),
 ('saddlebrown', 7863),
 ('sienna', 7313)]

## Keyword 

Pick the top six keywords for the prediction model

In [None]:
# Tally the values found in column 2 of the conversions table and keep the
# 24 most frequent ones (presumably the search-keyword column -- confirm
# against the conversions.tsv header).
cnv_np = datasets['conversions'].to_numpy()
keyword_name = cnv_np[:, 2]
_keyword_tally = Counter()
_keyword_tally.update(keyword_name.flat)
top_keyword_file = _keyword_tally.most_common(24)
top_keyword_file

[('nature', 260985),
 ('flowers', 152098),
 ('natural', 148573),
 ('mountain', 112230),
 ('sea', 108634),
 ('flower', 108199),
 ('sky', 104738),
 ('forest', 93503),
 ('ocean', 80329),
 ('dog', 78163),
 ('space', 70042),
 ('cat', 66788),
 ('beach', 62631),
 ('water', 55574),
 ('summer', 52543),
 ('mountains', 52003),
 ('moon', 49961),
 ('night', 49339),
 ('landscape', 48172),
 ('sunrise', 46574),
 ('animal', 45840),
 ('rain', 45796),
 ('tree', 45648),
 ('wood', 44947)]

Combine similar keywords together:
1. nature = 'nature' + 'natural'
2. flower = 'flowers' + 'flower'
3. sea = 'sea' + 'ocean'

Then the top six keywords are:
nature, flower, mountain, sea, sky, forest

# Camera preferences

Pick the top three camera brands for the prediction model

In [None]:
# Normalise the camera brand of every photo: keep only the first
# whitespace-separated word, lower-cased. Column 12 of the photos table is
# presumably the camera make -- confirm against the photos.tsv header.
cmr_np = datasets['photos'].to_numpy()
camera_name = cmr_np[:, 12]
# `x == x` is False only for NaN, so missing values are passed through
# unchanged and the list keeps one entry per photo row.
new = [(x.split()[0]).lower() if x == x else x for x in camera_name]
camera_name = new
top_camera_file = Counter(camera_name).most_common(3)
top_camera_file

[('canon', 8708), ('nikon', 4700), ('sony', 4052)]

Ignoring the missing (NaN) values, the top three camera brands for the prediction model are: Canon, Nikon, Sony

# Map preparation

Prepare the data for the introductory maps. The first part generates a map showing the most-downloaded photo in each nation; the second part generates a map showing the most-downloaded photo for each combination of nation and keyword.

In [None]:
# Build the two tables used for the maps:
#   keyword_table -- (photo_id, keyword) restricted to the six top keywords
#   country_table -- photo metadata with normalised country names
keyword_table = datasets['conversions'][['photo_id', 'keyword']]
country_table = datasets['photos'][['photo_id', 'photo_url', 'photo_location_country',
                                    'photo_location_latitude', 'photo_location_longitude',
                                    'stats_downloads', 'ai_description', 'photo_description']]

# Merge similar keywords. The pairs are applied one after another, in this
# order, as regex (substring) replacements on the 'keyword' column.
_keyword_synonyms = [
    ('natural', 'nature'),
    ('flowers', 'flower'),
    ('ocean', 'sea'),
]
newkeyword = keyword_table
for _variant, _canonical in _keyword_synonyms:
    newkeyword = newkeyword.replace({'keyword': _variant}, {'keyword': _canonical}, regex=True)
keyword_table = newkeyword[newkeyword.keyword.isin(['nature', 'flower', 'mountain', 'sea', 'sky', 'forest'])]

# Map local-language / abbreviated country names to their English names.
# Applied sequentially, in this order, as regex replacements.
_country_aliases = [
    ('USA', 'United States'),
    ('美國', 'United States'),
    ('Deutschland', 'Germany'),
    ('Schweiz', 'Switzerland'),
    ('Italien', 'Italy'),
    ('Italia', 'Italy'),
    ('Россия', 'Russia'),
    ('Brasil', 'Brazil'),
    ('中国', 'China'),
    ('Österreich', 'Austria'),
    ('México', 'Mexico'),
    ('Latvija', 'Latvia'),
    ('España', 'Spain'),
    ('Nederland', 'Netherlands'),
    ('日本', 'Japan'),
]
newcountry = country_table.dropna(subset=['photo_location_country'])
for _alias, _english in _country_aliases:
    newcountry = newcountry.replace({'photo_location_country': _alias},
                                    {'photo_location_country': _english}, regex=True)

country_table = newcountry

# Count photos per country (column 2 is 'photo_location_country' in the
# selection above) and keep the 50 most frequent countries.
country_np = country_table.to_numpy()
country_name = country_np[:, 2]
country_name_file = Counter(country_name.flat).most_common(50)
country_name_file

[('United States', 2385),
 ('Canada', 438),
 ('Italy', 428),
 ('United Kingdom', 424),
 ('Germany', 379),
 ('Australia', 341),
 ('France', 292),
 ('Iceland', 267),
 ('Switzerland', 245),
 ('Austria', 172),
 ('Indonesia', 127),
 ('Spain', 125),
 ('Brazil', 119),
 ('Russia', 118),
 ('New Zealand', 112),
 ('Portugal', 98),
 ('Mexico', 90),
 ('Netherlands', 85),
 ('India', 75),
 ('Norway', 75),
 ('South Africa', 72),
 ('Japan', 67),
 ('China', 65),
 ('Latvia', 63),
 ('Maldives', 58),
 ('Romania', 57),
 ('Greece', 52),
 ('Iran', 52),
 ('Poland', 46),
 ('Sweden', 46),
 ('Vietnam', 38),
 ('Thailand', 33),
 ('Chile', 33),
 ('Denmark', 32),
 ('Ukraine', 30),
 ('Argentina', 28),
 ('Turkey', 28),
 ('Hungary', 27),
 ('Singapore', 26),
 ('Philippines', 25),
 ('Slovenia', 25),
 ('Belgium', 24),
 ('Costa Rica', 23),
 ('Croatia', 23),
 ('Hong Kong', 22),
 ('Ireland', 22),
 ('Finland', 22),
 ('Taiwan', 22),
 ('Morocco', 21),
 ('Slovakia', 21)]

In [None]:
# Keep only the selected nations, then pick for each nation the row with
# the largest 'stats_downloads' value.
nations = ['United States', 'Canada', 'United Kingdom', 'Italy', 'Australia',
           'France', 'Iceland', 'Germany', 'Switzerland', 'Indonesia', 'Russia',
           'Brazil', 'New Zealand', 'India', 'Japan', 'China']
_in_selected_nations = country_table['photo_location_country'].isin(nations)
country_table = country_table[_in_selected_nations]

# idxmax yields one index label per nation; .loc pulls those rows back out.
_most_downloaded = country_table.groupby(['photo_location_country'])['stats_downloads'].idxmax()
featured_photo = country_table.loc[_most_downloaded]

featured_photo

Unnamed: 0,photo_id,photo_url,photo_location_country,photo_location_latitude,photo_location_longitude,stats_downloads,ai_description,photo_description
7887,tNDvFkxkBHo,https://unsplash.com/photos/tNDvFkxkBHo,Australia,,,228249,view of seashore sunset,Sunset over an Australian Beach
10712,QTIugFX1Gug,https://unsplash.com/photos/QTIugFX1Gug,Brazil,-23.013488,-43.32373,51768,two people surfing on water,Surfers
7585,lpjb_UMOyx8,https://unsplash.com/photos/lpjb_UMOyx8,Canada,51.425385,-116.177319,324889,photo of two mountains,Lake Louise landscape
17665,wpTWYBll4_w,https://unsplash.com/photos/wpTWYBll4_w,China,39.904211,116.407395,101617,yellow jelly fish,
10624,urUdKCxsTUI,https://unsplash.com/photos/urUdKCxsTUI,France,48.856614,2.352222,171664,shallow focus photography of white flowers,Spring flower blossoms on branch
14850,EPy0gBJzzZU,https://unsplash.com/photos/EPy0gBJzzZU,Germany,,,359925,green tree on grassland during daytime,Tree in green wheat field
2224,c9MFM8rSMsQ,https://unsplash.com/photos/c9MFM8rSMsQ,Iceland,63.61628,-20.013557,120209,,It’s often hard to see the beauty that’s right...
22063,83zRhEhFMfo,https://unsplash.com/photos/83zRhEhFMfo,India,17.796949,83.38161,151385,wayfarer sunglasses on beach sand during daytime,At the beach
19555,a8lTjWJJgLA,https://unsplash.com/photos/a8lTjWJJgLA,Indonesia,-8.409518,115.188916,106599,eagle-eye view photography of brown pathway,A Morning in Bali
15128,ln5drpv_ImI,https://unsplash.com/photos/ln5drpv_ImI,Italy,,,410213,silhouette photo of mountain during night time,Sublime purple night sky


In [None]:
# Attach keywords to the per-country photo rows, then keep, for every
# (country, keyword) pair, the row with the largest 'stats_downloads'.
keyword_merge = country_table.merge(keyword_table, on='photo_id', how='inner')
_most_downloaded_per_pair = (
    keyword_merge.groupby(['photo_location_country', 'keyword'])['stats_downloads'].idxmax()
)
keyword_photo = keyword_merge.loc[_most_downloaded_per_pair]
keyword_photo

Unnamed: 0,photo_id,photo_url,photo_location_country,photo_location_latitude,photo_location_longitude,stats_downloads,ai_description,photo_description,keyword
40672,swtg-ahmGzY,https://unsplash.com/photos/swtg-ahmGzY,Australia,,,9536,bee in fornt of Sunflower,Bees work very hard and do a fantastic job of ...,flower
252223,XK7thML3zEQ,https://unsplash.com/photos/XK7thML3zEQ,Australia,-25.274398,133.775136,189916,aerial photography of sea wave,Black sand beaches,forest
16903,iSYYLt2rKac,https://unsplash.com/photos/iSYYLt2rKac,Australia,-32.796609,134.198539,49462,galaxy during nighttime,absolutely zero light pollution. More on Inst...,mountain
130221,tNDvFkxkBHo,https://unsplash.com/photos/tNDvFkxkBHo,Australia,,,228249,view of seashore sunset,Sunset over an Australian Beach,nature
130220,tNDvFkxkBHo,https://unsplash.com/photos/tNDvFkxkBHo,Australia,,,228249,view of seashore sunset,Sunset over an Australian Beach,sea
130224,tNDvFkxkBHo,https://unsplash.com/photos/tNDvFkxkBHo,Australia,,,228249,view of seashore sunset,Sunset over an Australian Beach,sky
160010,1KJAjCSyi48,https://unsplash.com/photos/1KJAjCSyi48,Brazil,-20.812711,-49.376521,21430,yellow sunflower in tilt shift lens,sunflower,flower
323198,LZMc0dJGRUA,https://unsplash.com/photos/LZMc0dJGRUA,Brazil,-25.304952,-49.054143,4464,silhouette of mountain during sunset,sky,mountain
177204,QTIugFX1Gug,https://unsplash.com/photos/QTIugFX1Gug,Brazil,-23.013488,-43.323730,51768,two people surfing on water,Surfers,nature
174870,QTIugFX1Gug,https://unsplash.com/photos/QTIugFX1Gug,Brazil,-23.013488,-43.323730,51768,two people surfing on water,Surfers,sea


## Table combination

Develop a new table combining the color, keyword, camera and nation features with photo id for further filtering process. 

In [None]:
# Base tables for the prediction model, all keyed by photo_id:
#   color_table   -- (photo_id, color)
#   keyword_table -- (photo_id, keyword)
#   camera_table  -- (photo_id, exif_camera_make, photo_location_country)
# Only 'keyword' needs renaming in the colors table.
color_table = datasets['colors'][['photo_id', 'keyword']].rename(columns={"keyword": "color"})

keyword_table = datasets['conversions'][['photo_id', 'keyword']]

# Take an explicit copy: assigning a column on a slice of datasets['photos']
# raises pandas' SettingWithCopyWarning and may not behave as intended.
camera_table = datasets['photos'][['photo_id', 'exif_camera_make', 'photo_location_country']].copy()
# `camera_name` (built in the camera-preferences cell) holds the normalised
# brand for every photo row, in the same order as datasets['photos'].
camera_table['exif_camera_make'] = camera_name

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [None]:
# Merge similar keywords, then keep only the six top keywords. The pairs
# are applied sequentially, in this order, as regex (substring)
# replacements on the 'keyword' column.
_keyword_synonyms = [
    ('natural', 'nature'),
    ('flowers', 'flower'),
    ('ocean', 'sea'),
]
newkeyword = keyword_table
for _variant, _canonical in _keyword_synonyms:
    newkeyword = newkeyword.replace({'keyword': _variant}, {'keyword': _canonical}, regex=True)

keyword_table = newkeyword[newkeyword.keyword.isin(['nature', 'flower', 'mountain', 'sea', 'sky', 'forest'])]

In [None]:
# Normalise the country names of camera_table and build the per-nation
# image totals used later as the denominator of the nation ratios.

# Map local-language / abbreviated country names to their English names.
# Applied sequentially, in this order, as regex replacements.
_country_aliases = [
    ('USA', 'United States'),
    ('美國', 'United States'),
    ('Deutschland', 'Germany'),
    ('Schweiz', 'Switzerland'),
    ('Italien', 'Italy'),
    ('Italia', 'Italy'),
    ('Россия', 'Russia'),
    ('Brasil', 'Brazil'),
    ('中国', 'China'),
    ('Österreich', 'Austria'),
    ('México', 'Mexico'),
    ('Latvija', 'Latvia'),
    ('España', 'Spain'),
    ('Nederland', 'Netherlands'),
    ('日本', 'Japan'),
]
newcountry = camera_table.dropna(subset=['photo_location_country'])
for _alias, _english in _country_aliases:
    newcountry = newcountry.replace({'photo_location_country': _alias},
                                    {'photo_location_country': _english}, regex=True)

camera_table = newcountry
# `country_name` is the array of normalised country names built in the
# map-preparation cell.
country_name_file = Counter(country_name.flat).most_common(50)
test = np.array(country_name_file)
# Map each nation to its total image count. The previous
# Counter(test.flat) counted how often each *string* occurs in the
# (name, count) array, which gives 1 for every nation rather than its
# image total, so the later "divide by the nation's total" step divided by 1.
top_country_file = Counter(dict(country_name_file))
test[:,0]

array(['United States', 'Canada', 'Italy', 'United Kingdom', 'Germany',
       'Australia', 'France', 'Iceland', 'Switzerland', 'Austria',
       'Indonesia', 'Spain', 'Brazil', 'Russia', 'New Zealand',
       'Portugal', 'Mexico', 'Netherlands', 'India', 'Norway',
       'South Africa', 'Japan', 'China', 'Latvia', 'Maldives', 'Romania',
       'Greece', 'Iran', 'Poland', 'Sweden', 'Vietnam', 'Thailand',
       'Chile', 'Denmark', 'Ukraine', 'Argentina', 'Turkey', 'Hungary',
       'Singapore', 'Philippines', 'Slovenia', 'Belgium', 'Costa Rica',
       'Croatia', 'Hong Kong', 'Ireland', 'Finland', 'Taiwan', 'Morocco',
       'Slovakia'], dtype='<U14')

In [None]:
# Restrict camera_table to photos taken in one of the 50 countries below.
_kept_countries = [
    'United States',
    'Canada', 'Italy', 'United Kingdom', 'Germany',
    'Australia', 'France', 'Iceland', 'Switzerland', 'Austria',
    'Indonesia', 'Spain', 'Brazil', 'Russia', 'New Zealand',
    'Portugal', 'Mexico', 'Netherlands', 'India', 'Norway',
    'South Africa', 'Japan', 'China', 'Latvia', 'Maldives', 'Romania',
    'Greece', 'Iran', 'Poland', 'Sweden', 'Vietnam', 'Thailand',
    'Chile', 'Denmark', 'Ukraine', 'Argentina', 'Turkey', 'Hungary',
    'Singapore', 'Philippines', 'Slovenia', 'Belgium', 'Costa Rica',
    'Croatia', 'Hong Kong', 'Ireland', 'Finland', 'Taiwan', 'Morocco',
    'Slovakia',
]
_country_is_kept = camera_table['photo_location_country'].isin(_kept_countries)
camera_table = camera_table[_country_is_kept]

# Prediction modelling

For each combination of color, keyword and camera, count the number of images per nation

In [None]:
# For every (color, keyword, camera) combination, count the distinct photos
# per country and record the five most frequent countries with their counts.
# Result: `infotable`, one row per combination with columns
# color, keyword, camera, Nation1, Num1, ..., Nation5, Num5.
top_colors = ['darkslategray','dimgray','darkolivegreen','black','gray','darkgray']
top_keywords = ['nature', 'flower', 'mountain', 'sea', 'sky', 'forest']
top_cameras = ['canon', 'nikon', 'sony']
_nations_per_row = 5

# The per-camera subsets do not depend on color/keyword: build them once.
_camera_subsets = {
    cmr: camera_table[camera_table.exif_camera_make.isin([cmr])]
    for cmr in top_cameras
}

# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append copied the whole table on every call and no longer
# exists in pandas >= 2.0.
_rows = []
for col in top_colors:
    _color_subset = color_table[color_table.color.isin([col])]
    for kw in top_keywords:
        _keyword_subset = keyword_table[keyword_table.keyword.isin([kw])]
        # The color/keyword join is shared by all three cameras.
        _color_keyword = pd.merge(_color_subset, _keyword_subset, on='photo_id', how='inner')
        for cmr in top_cameras:
            combinedata = pd.merge(_color_keyword, _camera_subsets[cmr], on='photo_id', how='inner')
            # A photo can match several color/keyword rows; count it once.
            _unique_photos = combinedata.drop_duplicates(subset=['photo_id'])
            _ranking = Counter(_unique_photos['photo_location_country'].tolist()).most_common(_nations_per_row)
            # Pad when fewer than five countries match, instead of failing
            # with an IndexError; padded slots have no nation and a count of 0.
            _ranking += [(None, 0)] * (_nations_per_row - len(_ranking))

            _row = {"color": col, "keyword": kw, "camera": cmr}
            for _rank, (_nation, _count) in enumerate(_ranking, start=1):
                _row["Nation%d" % _rank] = _nation
                # Counts are stored as integers (previously numeric strings).
                _row["Num%d" % _rank] = _count
            _rows.append(_row)

infotable = pd.DataFrame(_rows)

In [None]:
# Turn the per-combination nation counts into normalised ratios.
# `df` has one row per entry of top_country_file: column 'index' holds the
# key, column 0 the associated value.
df = pd.DataFrame.from_dict(top_country_file, orient='index').reset_index()

# NOTE: this is an alias, not a copy -- every column added to
# infotable_update is also added to infotable.
infotable_update = infotable


def _attach_nation_ratio(rank, cast):
    """Add 'nation<rank>ratio' = Num<rank> / count<rank> to infotable_update.

    Returns the merged frame (infotable joined with the per-nation values
    from `df` on 'Nation<rank>').
    """
    suffix = str(rank)
    nation_col = 'Nation' + suffix
    count_col = 'count' + suffix
    ratio_col = 'nation' + suffix + 'ratio'

    totals = df.rename(columns={'index': nation_col, 0: count_col})
    merged = infotable.merge(totals, on=nation_col, how='left')
    merged[ratio_col] = merged['Num' + suffix].astype(cast) / merged[count_col]
    infotable_update[ratio_col] = merged[ratio_col]
    return merged


first_nation = _attach_nation_ratio(1, float)
second_nation = _attach_nation_ratio(2, int)
third_nation = _attach_nation_ratio(3, int)
fourth_nation = _attach_nation_ratio(4, int)
fifth_nation = _attach_nation_ratio(5, int)

_ratio_columns = ['nation1ratio', 'nation2ratio', 'nation3ratio', 'nation4ratio', 'nation5ratio']

# Sum of the five ratios, accumulated left to right.
_ratio_total = infotable_update[_ratio_columns[0]]
for _name in _ratio_columns[1:]:
    _ratio_total = _ratio_total + infotable_update[_name]
infotable_update['sum'] = _ratio_total

# Express every ratio as its share of that sum.
for _name in _ratio_columns:
    infotable_update[_name] = infotable_update[_name] / infotable_update['sum']

In [None]:
infotable_update
# In this table, each nation#ratio is the relative likelihood that a taste for the corresponding color,
# keyword and camera comes from the corresponding Nation#. It is calculated by:
# 1) dividing the number of that nation's images with the selected features (Num#) by the nation's
#    count taken from top_country_file (intended to be the nation's total number of images);
# 2) summing the five ratios of the row (stored in the 'sum' column);
# 3) dividing each ratio by that sum, so that each value is its share of the row total.

Unnamed: 0,color,keyword,camera,Nation1,Num1,Nation2,Num2,Nation3,Num3,Nation4,Num4,Nation5,Num5,nation1ratio,nation2ratio,nation3ratio,nation4ratio,nation5ratio,sum
0,darkslategray,nature,canon,United States,64,Italy,32,Canada,14,Switzerland,8,Austria,8,0.126144,0.351463,0.150254,0.153496,0.218643,0.212729
1,darkslategray,nature,nikon,United States,48,Italy,12,Australia,10,Iceland,8,Canada,7,0.163050,0.227147,0.237583,0.242743,0.129477,0.123433
2,darkslategray,nature,sony,United States,33,Iceland,15,Canada,12,Germany,10,Italy,10,0.094021,0.381751,0.186169,0.179292,0.158766,0.147163
3,darkslategray,flower,canon,United States,15,Germany,7,Taiwan,5,United Kingdom,4,Russia,3,0.021922,0.064379,0.792196,0.032884,0.088619,0.286889
4,darkslategray,flower,nikon,United States,11,Netherlands,4,Romania,3,Iran,2,France,2,0.030827,0.314536,0.351784,0.257073,0.045780,0.149613
5,darkslategray,flower,sony,United States,7,Austria,2,Italy,2,New Zealand,1,China,1,0.067396,0.267007,0.107302,0.205024,0.353271,0.043549
6,darkslategray,mountain,canon,Italy,29,United States,17,Switzerland,10,Austria,9,Canada,8,0.363715,0.038262,0.219099,0.280880,0.098044,0.186292
7,darkslategray,mountain,nikon,United States,15,Italy,11,New Zealand,6,France,5,Norway,2,0.048622,0.198690,0.414154,0.132378,0.206156,0.129352
8,darkslategray,mountain,sony,United States,13,Iceland,4,Germany,4,Switzerland,4,Italy,4,0.096203,0.264414,0.186276,0.288157,0.164950,0.056658
9,darkslategray,sea,canon,United States,21,New Zealand,12,Australia,4,South Africa,4,United Kingdom,3,0.046267,0.562994,0.061638,0.291923,0.037179,0.190309


# Photo Recommendation

In [None]:
# Build the recommendation table: every (photo, color, keyword, camera,
# country) combination restricted to the top colors, keywords and cameras.
top_colors = ['darkslategray','dimgray','darkolivegreen','black','gray','darkgray']
top_keywords = ['nature', 'flower', 'mountain', 'sea', 'sky', 'forest']
top_cameras = ['canon', 'nikon', 'sony']

_color_is_top = color_table['color'].isin(top_colors)
_keyword_is_top = keyword_table['keyword'].isin(top_keywords)
_camera_is_top = camera_table['exif_camera_make'].isin(top_cameras)

color_table_1 = color_table[_color_is_top]
keyword_table_1 = keyword_table[_keyword_is_top]
camera_table_1 = camera_table[_camera_is_top]

combinedata = (
    color_table_1
    .merge(keyword_table_1, on='photo_id', how='inner')
    .merge(camera_table_1, on='photo_id', how='inner')
    .drop_duplicates()
)
# Note: only fully identical rows are dropped, so a photo can still appear
# several times, once per distinct feature combination.

In [None]:
# Add the URL, download count and descriptions of each photo to the
# recommendation table.
_url_columns = ['photo_id', 'photo_url', 'stats_downloads', 'ai_description', 'photo_description']
url_table = datasets['photos'][_url_columns]
combinedata = combinedata.merge(url_table, on='photo_id', how='inner')
combinedata

Unnamed: 0,photo_id,color,keyword,exif_camera_make,photo_location_country,photo_url,stats_downloads,ai_description,photo_description
0,LUHnStun6JY,darkolivegreen,nature,canon,United States,https://unsplash.com/photos/LUHnStun6JY,1734,person wearing camouflage pants,doors off helicopter ride at sunset..... price...
1,LUHnStun6JY,gray,nature,canon,United States,https://unsplash.com/photos/LUHnStun6JY,1734,person wearing camouflage pants,doors off helicopter ride at sunset..... price...
2,LUHnStun6JY,darkslategray,nature,canon,United States,https://unsplash.com/photos/LUHnStun6JY,1734,person wearing camouflage pants,doors off helicopter ride at sunset..... price...
3,jJKfRPoy9B0,gray,mountain,canon,Italy,https://unsplash.com/photos/jJKfRPoy9B0,3961,white mountains,"Autumn is just around the corner, you can smel..."
4,jJKfRPoy9B0,gray,nature,canon,Italy,https://unsplash.com/photos/jJKfRPoy9B0,3961,white mountains,"Autumn is just around the corner, you can smel..."
5,jJKfRPoy9B0,darkgray,mountain,canon,Italy,https://unsplash.com/photos/jJKfRPoy9B0,3961,white mountains,"Autumn is just around the corner, you can smel..."
6,jJKfRPoy9B0,darkgray,nature,canon,Italy,https://unsplash.com/photos/jJKfRPoy9B0,3961,white mountains,"Autumn is just around the corner, you can smel..."
7,jJKfRPoy9B0,darkolivegreen,mountain,canon,Italy,https://unsplash.com/photos/jJKfRPoy9B0,3961,white mountains,"Autumn is just around the corner, you can smel..."
8,jJKfRPoy9B0,darkolivegreen,nature,canon,Italy,https://unsplash.com/photos/jJKfRPoy9B0,3961,white mountains,"Autumn is just around the corner, you can smel..."
9,jJKfRPoy9B0,dimgray,mountain,canon,Italy,https://unsplash.com/photos/jJKfRPoy9B0,3961,white mountains,"Autumn is just around the corner, you can smel..."


In [None]:
# Rebuild the recommendation table without the camera-brand restriction,
# then keep the most-downloaded photo for each (color, country, keyword)
# combination, limited to three countries.
color_table_1 = color_table[color_table['color'].isin(top_colors)]
keyword_table_1 = keyword_table[keyword_table['keyword'].isin(top_keywords)]
url_table = datasets['photos'][['photo_id','photo_url','stats_downloads','ai_description','photo_description']]

combinedata = (
    color_table_1
    .merge(keyword_table_1, on='photo_id', how='inner')
    .merge(camera_table, on='photo_id', how='inner')
    .drop_duplicates()
    .merge(url_table, on='photo_id', how='inner')
)

# One index label per group: the row with the largest 'stats_downloads'.
_top_rows = combinedata.groupby(['color',
       'photo_location_country','keyword'])['stats_downloads'].idxmax()
rec_photo = combinedata.loc[_top_rows]

_shown_countries = ['United States',
        'Italy',  'France']
rec_photo = rec_photo[rec_photo['photo_location_country'].isin(_shown_countries)]

rec_photo.shape

In [None]:
# Write the result tables to CSV files in the working directory.
combinedata.to_csv(r'photoRecommendation.csv', index = False)
infotable_update.to_csv(r'predictionModel.csv', index = False)
featured_photo.to_csv(r'featuredPhoto.csv', index = False)
keyword_photo.to_csv(r'keywordFeatured.csv', index = False)
# The image-download step reads 'recommandPhotoHighest.csv', which was never
# written anywhere in this notebook. `rec_photo` (most-downloaded photo per
# color / country / keyword, limited to France, Italy and United States)
# has the columns that step uses, so it is exported under that name.
# NOTE(review): confirm this is the intended source of that file.
rec_photo.to_csv(r'recommandPhotoHighest.csv', index = False)

# Creating Barplots

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from subprocess import check_output,CalledProcessError
from bs4 import BeautifulSoup
import requests

In [None]:
# Draw one bar chart per row of predictionModel.csv (one per color /
# keyword / camera combination) showing the five nation ratios, and save it
# as barplots/<color>_<keyword>_<camera>.png.
import os

df = pd.read_csv('predictionModel.csv')

# Loop-invariant setup: global font size and the output directory.
plt.rcParams.update({'font.size': 20})
# savefig does not create missing directories.
os.makedirs('barplots', exist_ok=True)

_nation_columns = ['Nation1', 'Nation2', 'Nation3', 'Nation4', 'Nation5']
_ratio_columns = ['nation1ratio', 'nation2ratio', 'nation3ratio', 'nation4ratio', 'nation5ratio']

for _, row in df.iterrows():
    x = [row[name] for name in _nation_columns]
    y = [float(row[name]) for name in _ratio_columns]

    fig, ax = plt.subplots(figsize=(10,6))
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.bar(x,y,color ="#1e467d")
    ax.set_ylim(top=1.0)
    ax.set_ylabel('Probability')
    # Print the rounded value just above each bar.
    for index, value in enumerate(y):
        ax.text(index-0.3, value+0.03, str(round(value,2)))
    plt.xticks(rotation=60)
    plt.tight_layout()
    plt.savefig("barplots/"+str(row['color'])+"_"+str(row['keyword'])+"_"+str(row['camera'])+'.png',dpi=1200)
    # Release the figure: pyplot keeps every figure alive until it is
    # closed, so memory would otherwise grow with each plot.
    plt.close(fig)


# Download Unsplash Images

In [None]:
# Download one image per row of recommandPhotoHighest.csv for photos
# located in France, Italy or the United States. Each image is saved in the
# working directory as <color>_<keyword>_<Country_Name>.jpeg.
df1 = pd.read_csv('recommandPhotoHighest.csv')
c_l = ['France','Italy','United States']
import os
for _, row in df1.iterrows():
    country = str(row['photo_location_country'])
    if country not in c_l:
        continue
    # Country names may contain spaces; use underscores in the file name.
    o = "_".join(country.split())
    l = str(row['color'])
    j = str(row['keyword'])
    url = row['photo_url']

    r = requests.get(url, timeout=30)
    if not r.ok:
        print("Skipping " + str(url) + ": HTTP status " + str(r.status_code))
        continue

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    page = BeautifulSoup(r.text, 'html.parser')
    imglist = page.find_all("img")
    # The third <img> of the photo page is expected to be the photo itself;
    # its srcset starts with the image URL.
    if len(imglist) < 3 or not imglist[2].get('srcset'):
        print("Skipping " + str(url) + ": no image srcset found on the page")
        continue
    picurl = imglist[2]['srcset'].split()[0]

    # Pass the arguments as a list, without a shell: `picurl` is scraped from
    # a web page and must never be interpolated into a shell command line.
    check_output(["curl", picurl, "-o", l+"_"+j+"_"+o+".jpeg"])