-
Notifications
You must be signed in to change notification settings - Fork 416
/
imdb.celeb.birthdays.xml
147 lines (139 loc) · 5.52 KB
/
imdb.celeb.birthdays.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
<?xml version="1.0" encoding="UTF-8" ?>
<table xmlns="http://query.yahooapis.com/v1/schema/table.xsd">
<meta>
<sampleQuery> select * from {table}</sampleQuery>
</meta>
<bindings>
<select itemPath="birthdays.person" produces="XML">
<urls>
<url></url>
</urls>
<!-- since we can't match paging to the underlying BOSS paging model precisely, we'll just claim we can do fixed start (0) with variable
number of results -->
<paging model="offset">
<pagesize id="count" max="300" />
<total default="10" />
</paging>
<inputs>
<key id='date' type='xs:string' paramType='variable' />
</inputs>
<execute><![CDATA[
//object to query imdb to extract bio info for a person
//name - display name of the person
//url  - address of the person's imdb page; it is embedded verbatim in the yql statement
//the constructor stores both values and issues the yql html query straight away,
//keeping the handle in this.query so that getData() can read its results later
var celebInfo = function(name,url) {
    var contentXpath = "//div[@id='tn15']";
    this.name = name;
    this.url = url;
    this.query = y.query("select * from html where url = '"+url+"' and xpath=\""+contentXpath+"\"");
};
//actually extract the info and return an xml object
celebInfo.prototype.getData=function() {
default xml namespace ='';
var d = this.query.results;
var img = d..div.(@["id"]=="tn15lhs").div.a.img;
var content = d..div.(@['id'] =="tn15content");
var bio = "";
//this is pretty hacky
for each (var node in content.p) {
if (node.text().toString().trim().length>100) {
bio = node.*;
break;
}
}
var anchors = content.a;
var bornInYear = null;
var bornWhere = null;
var diedInYear = null;
var onThisDay = [];
//TODO see if there is a wildcard way of pulling these out using e4x/xpath
for each (var a in anchors) {
var href = a.@['href'].toString();
if (href.indexOf("/BornInYear")==0) {
bornInYear = a.toString().trim();
continue;
}
if (href.indexOf("/DiedInYear")==0) {
diedInYear = a.toString().trim();
continue;
}
if (href.indexOf("/BornWhere")==0) {
bornWhere = a.toString().trim();
continue;
}
if (href.indexOf("/OnThisDay")==0) {
onThisDay.push(a.text().toString().trim());
continue;
}
}
var bornDayMonth=null;
var diedDayMonth=null;
if (onThisDay.length>0) {
bornDayMonth = onThisDay[0].replace(/^\s*(\d{1,2})[\s]+(\w+)\s*/,'$1 $2'); //tidy up whitespace around text
if (diedInYear && onThisDay.length>1) {
diedDayMonth= onThisDay[1].replace(/^\s*(\d{1,2})[\s]+(\w+)\s*/,'$1 $2'); //tidy up whitespace around text
}
}
var url = this.url;
var name = this.name;
var bornTime = null;
if (bornDayMonth) {
var daymonth = bornDayMonth.split(" ");
bornTime=new Date(bornInYear,Date.getMonthFromString(daymonth[1]),parseInt(daymonth[0])).getTime()/1000;
}
var diedTime = null;
if (diedDayMonth) {
var daymonth = diedDayMonth.split(" ");
diedTime=new Date(diedInYear,Date.getMonthFromString(daymonth[1]),parseInt(daymonth[0])).getTime()/1000;
}
var person = <person url={url}><name>{name}</name>{img}<born utime={bornTime}>{bornDayMonth} {bornInYear}</born></person>;
if (diedTime) person.person+=<died utime={diedTime}>{diedDayMonth} {diedInYear}</died>;
if (bio) person.person+=<bio>{bio}</bio>;
return person;
}
//general useful routines
//returns a copy of the string with leading and trailing whitespace removed
String.prototype.trim = function() {
    var withoutLeading = this.replace(/^[\s]*/,'');
    return withoutLeading.replace(/[\s]*$/,'');
};
//maps a full english month name (e.g. 'March') to its zero based month index;
//any name that is not in the table yields undefined
Date.getMonthFromString = function(month) {
    var monthIndexByName = {
        'January':0, 'February':1, 'March':2, 'April':3,
        'May':4, 'June':5, 'July':6, 'August':7,
        'September':8, 'October':9, 'November':10, 'December':11
    };
    return monthIndexByName[month];
};
//returns the full english name of this date's month, looked up by getMonth()
Date.prototype.getMonthName = function() {
    var monthNames = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December'
    ];
    var monthIndex = this.getMonth();
    return monthNames[monthIndex];
};
//the main object that uses boss to get the list (also gets peoples "death" days too)
celebSearch = function(when,start,count) {
//search yahoo/boss using the current day and month only on bio pages on imdb
var bornDayMonth = when.getDate()+" "+when.getMonthName();
var ud = Math.round(when.getTime()/1000);
var search = 'site:www.imdb.com "Date of birth" "'+bornDayMonth+'" title:biography'
var query = "select * from search.web("+start+","+count+") where query='"+search+"'";
var celebs = y.query(query).results;
//go through each result and start to get the persons name and their imdb info page out
var results = [];
default xml namespace ='http://www.inktomi.com/'; //make sure our e4x is in the right namespace. IMPORTANT
for each (var celeb in celebs.result) {
//discard any hits on the date of death that also match in our yahoo search
//(this is going to hurt our paging)
if (celeb["abstract"].toString().indexOf("<b>Date of Birth</b>. <b>"+bornDayMonth)<0) continue;
var j = celeb.title.toString().indexOf("-"); //use text up to "dash" from title for name
var name = celeb.title.toString().substring(0,j).trim();
//start parsing these entries by pulling from imdb directly
results.push(new celebInfo(name,celeb.url));
}
//loop through each imdb fetch result, and create the result object
default xml namespace = '';
var data = <birthdays utime={ud} date={when} />;
for each (var celeb in results) {
data.birthdays+=celeb.getData();
}
return data;
}
//run it for today if no date was provided
//date, count and response are not declared in this script; they are expected to be supplied by the runtime
var when = (date && date.length>0)
    ? new Date(date) //TODO needs a well formed date including year
    : new Date();
response.object = new celebSearch(when,0,count);
]]></execute>
</select>
</bindings>
</table>