# FILTER() REMOVES DATA FROM YOUR RDD
- Just takes a function that returns a boolean
- For example, we want to filter out entries that don't have 'TMIN' in the second item (index 1) of each parsed record:
```
minTemps = parsedLines.filter(lambda x: 'TMIN' in x[1])
```


###### Input data snippet:
```
ITE00100554,18000101,TMAX,-75,,,E,
```
Fields: weather_station_id, timestamp, observation type, temperature reading, ...

In [1]:
from pyspark import SparkConf, SparkContext

In [2]:
# Build the Spark configuration step by step (master 'local', application
# name 'MinTemperatures'), then create the context used by the cells below.
conf = SparkConf()
conf.setMaster('local')
conf.setAppName('MinTemperatures')
sc = SparkContext(conf=conf)

In [11]:
def parse_line(line):
    """Parse one comma-separated record into a 3-tuple.

    Column 0 is taken as the station id, column 2 as the entry type, and
    column 3 as the numeric reading.  The reading is scaled by 0.1
    (presumably it is stored in tenths of a degree) and converted from
    Celsius to Fahrenheit.

    Returns:
        (station_id, entry_type, temperature) with temperature as a float.
    """
    columns = line.split(',')
    station_id = columns[0]
    entry_type = columns[2]
    raw_reading = float(columns[3])
    # Scale first, then apply the Celsius -> Fahrenheit formula (x * 9/5 + 32).
    temperature = raw_reading * 0.1 * (9.0/5.0) + 32.0
    return (station_id, entry_type, temperature)

In [12]:
# Read the raw CSV and turn every row into
# (station_id, entry_type, temperature) via parse_line.
line = sc.textFile('1800.csv')
parsed_line = line.map(parse_line)

# Keep only the records whose entry type contains 'TMIN' ...
min_temps = parsed_line.filter(lambda record: 'TMIN' in record[1])
# ... drop the entry type, leaving (station_id, temperature) pairs ...
station_temps = min_temps.map(lambda record: (record[0], record[2]))
# ... and keep the smallest temperature seen for each station.
min_temps = station_temps.reduceByKey(min)
results = min_temps.collect()

In [13]:
# Print one numbered line per station: index, station id, temperature (2 d.p.).
for num, (station_id, temperature) in enumerate(results):
    formatted_temperature = "\t{:.2f}F".format(temperature)
    print(num, " ", station_id, formatted_temperature)

0   ITE00100554 	5.36F
1   EZE00100082 	7.70F


#### Now find the maximum temperature

In [14]:
# Keep only the records whose entry type contains 'TMAX' ...
max_temps = parsed_line.filter(lambda record: 'TMAX' in record[1])
# ... drop the entry type, leaving (station_id, temperature) pairs ...
station_temps = max_temps.map(lambda record: (record[0], record[2]))
# ... and keep the largest temperature seen for each station.
max_temps = station_temps.reduceByKey(max)
results = max_temps.collect()

In [15]:
# Print one numbered line per station: index, station id, temperature (2 d.p.).
for num, (station_id, temperature) in enumerate(results):
    formatted_temperature = "\t{:.2f}F".format(temperature)
    print(num, " ", station_id, formatted_temperature)

0   ITE00100554 	90.14F
1   EZE00100082 	90.14F
