In [1]:
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext, SparkConf

In [3]:
# Spark configuration: application name "RDDJoin", master set to "local[4]".
conf = (
    SparkConf()
    .setAppName("RDDJoin")
    .setMaster("local[4]")
)

In [4]:
# Reuse the active SparkContext when this cell is re-executed in the same kernel;
# calling the SparkContext constructor a second time raises an error because only
# one context may be active. `conf` is applied only when a new context is created.
sc = SparkContext.getOrCreate(conf=conf)

# VERİ OKUMA SAFHASI

In [5]:
# Read order_items.csv as lines of text, discard every line that contains the
# "orderItemName" column label (i.e. the header row), then repartition into 4.
order_items_rdd = (
    sc.textFile("D:/Datasets/retail_db/order_items.csv")
    .filter(lambda row: "orderItemName" not in row)
    .repartition(4)
)

In [6]:
# Preview the first five raw lines to check the read/filter step.
order_items_rdd.take(5)

['11,5,1014,2,99.96,49.98',
 '12,5,957,1,299.98,299.98',
 '13,5,403,1,129.99,129.99',
 '14,7,1073,1,199.99,199.99',
 '15,7,957,1,299.98,299.98']

In [7]:
# Read products.csv as lines of text, discard every line that contains the
# "productDescription" column label (i.e. the header row), then repartition into 4.
products_rdd = (
    sc.textFile("D:/Datasets/retail_db/products.csv")
    .filter(lambda row: "productDescription" not in row)
    .repartition(4)
)

In [8]:
# Preview the first five raw lines to check the read/filter step.
products_rdd.take(5)

['11,2,Fitness Gear 300 lb Olympic Weight Set,,209.99,http://images.acmesports.sports/Fitness+Gear+300+lb+Olympic+Weight+Set',
 "12,2,Under Armour Men's Highlight MC Alter Ego Fla,,139.99,http://images.acmesports.sports/Under+Armour+Men%27s+Highlight+MC+Alter+Ego+Flash+Football...",
 "13,2,Under Armour Men's Renegade D Mid Football Cl,,89.99,http://images.acmesports.sports/Under+Armour+Men%27s+Renegade+D+Mid+Football+Cleat",
 '14,2,Quik Shade Summit SX170 10 FT. x 10 FT. Canop,,199.99,http://images.acmesports.sports/Quik+Shade+Summit+SX170+10+FT.+x+10+FT.+Canopy',
 "15,2,Under Armour Kids' Highlight RM Alter Ego Sup,,59.99,http://images.acmesports.sports/Under+Armour+Kids%27+Highlight+RM+Alter+Ego+Superman+Football..."]

# OKUNAN VERİLERİ PAIR RDD'ye ÇEVİRME SAFHASI

In [9]:
# Build the (key, value) pair for one order_items line.
def make_order_items_pair_rdd(line):
    """Convert one comma-separated order_items line into a pair keyed by product id.

    Args:
        line: A single text line with at least six comma-separated columns.
            Columns beyond the sixth are ignored.

    Returns:
        A tuple ``(col2, (col0, col1, col3, col4, col5))`` of strings, i.e. the
        third column (orderItemProductId) becomes the key and the remaining
        five columns, in their original order, become the value tuple.

    Raises:
        IndexError: If the line has fewer than six columns.
    """
    # Split once and index into the result instead of re-splitting per column.
    fields = line.split(",")

    orderItemName = fields[0]
    orderItemOrderId = fields[1]
    orderItemProductId = fields[2]
    orderItemQuantity = fields[3]
    orderItemSubTotal = fields[4]
    orderItemProductPrice = fields[5]

    return (
        orderItemProductId,
        (
            orderItemName,
            orderItemOrderId,
            orderItemQuantity,
            orderItemSubTotal,
            orderItemProductPrice,
        ),
    )

In [10]:
# Apply the line parser to every order_items record to obtain a pair RDD
# whose key is the third CSV column (orderItemProductId).
order_item_pair_rdd = order_items_rdd.map(make_order_items_pair_rdd)

In [11]:
# Preview five (key, value) pairs to check the shape produced by the parser.
order_item_pair_rdd.take(5)

[('1014', ('11', '5', '2', '99.96', '49.98')),
 ('957', ('12', '5', '1', '299.98', '299.98')),
 ('403', ('13', '5', '1', '129.99', '129.99')),
 ('1073', ('14', '7', '1', '199.99', '199.99')),
 ('957', ('15', '7', '1', '299.98', '299.98'))]

In [12]:
# Build the (key, value) pair for one products line.
def make_products_pair_rdd(line):
    """Convert one comma-separated products line into a pair keyed by product id.

    Args:
        line: A single text line with at least six comma-separated columns.
            Columns beyond the sixth are ignored.

    Returns:
        A tuple ``(col0, (col1, col2, col3, col4, col5))`` of strings, i.e. the
        first column (productId) becomes the key and the next five columns
        become the value tuple.

    Raises:
        IndexError: If the line has fewer than six columns.

    NOTE(review): a plain ``split(",")`` assumes no field (e.g. the product
    name) contains an embedded comma; such a row would shift the later
    columns. Confirm against products.csv.
    """
    # Split once and index into the result instead of re-splitting per column.
    fields = line.split(",")

    productId = fields[0]
    productCategoryId = fields[1]
    productName = fields[2]
    productDescription = fields[3]
    productPrice = fields[4]
    productImage = fields[5]

    return (
        productId,
        (productCategoryId, productName, productDescription, productPrice, productImage),
    )

In [13]:
# Apply the line parser to every products record to obtain a pair RDD
# whose key is the first CSV column (productId).
products_pair_rdd = products_rdd.map(make_products_pair_rdd)

In [14]:
# Preview five (key, value) pairs to check the shape produced by the parser.
products_pair_rdd.take(5)

[('11',
  ('2',
   'Fitness Gear 300 lb Olympic Weight Set',
   '',
   '209.99',
   'http://images.acmesports.sports/Fitness+Gear+300+lb+Olympic+Weight+Set')),
 ('12',
  ('2',
   "Under Armour Men's Highlight MC Alter Ego Fla",
   '',
   '139.99',
   'http://images.acmesports.sports/Under+Armour+Men%27s+Highlight+MC+Alter+Ego+Flash+Football...')),
 ('13',
  ('2',
   "Under Armour Men's Renegade D Mid Football Cl",
   '',
   '89.99',
   'http://images.acmesports.sports/Under+Armour+Men%27s+Renegade+D+Mid+Football+Cleat')),
 ('14',
  ('2',
   'Quik Shade Summit SX170 10 FT. x 10 FT. Canop',
   '',
   '199.99',
   'http://images.acmesports.sports/Quik+Shade+Summit+SX170+10+FT.+x+10+FT.+Canopy')),
 ('15',
  ('2',
   "Under Armour Kids' Highlight RM Alter Ego Sup",
   '',
   '59.99',
   'http://images.acmesports.sports/Under+Armour+Kids%27+Highlight+RM+Alter+Ego+Superman+Football...'))]

# JOIN AŞAMASI

In [15]:
# Join the two pair RDDs on their keys (the product id strings). Each result
# element has the form (key, (order_item_values, product_values)).
order_items_product_pair_rdd = order_item_pair_rdd.join(products_pair_rdd)

In [16]:
# Preview five joined records to check the nested (key, (left, right)) structure.
order_items_product_pair_rdd.take(5)

[('957',
  (('12', '5', '1', '299.98', '299.98'),
   ('43',
    "Diamondback Women's Serene Classic Comfort Bi",
    '',
    '299.98',
    'http://images.acmesports.sports/Diamondback+Women%27s+Serene+Classic+Comfort+Bike+2014'))),
 ('957',
  (('15', '7', '1', '299.98', '299.98'),
   ('43',
    "Diamondback Women's Serene Classic Comfort Bi",
    '',
    '299.98',
    'http://images.acmesports.sports/Diamondback+Women%27s+Serene+Classic+Comfort+Bike+2014'))),
 ('957',
  (('59', '19', '1', '299.98', '299.98'),
   ('43',
    "Diamondback Women's Serene Classic Comfort Bi",
    '',
    '299.98',
    'http://images.acmesports.sports/Diamondback+Women%27s+Serene+Classic+Comfort+Bike+2014'))),
 ('957',
  (('94', '34', '1', '299.98', '299.98'),
   ('43',
    "Diamondback Women's Serene Classic Comfort Bi",
    '',
    '299.98',
    'http://images.acmesports.sports/Diamondback+Women%27s+Serene+Classic+Comfort+Bike+2014'))),
 ('957',
  (('97', '36', '1', '299.98', '299.98'),
   ('43',
    "Diam

In [None]:
# sc.stop()