Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

mega ultra fancy #3

Closed
wants to merge 1 commit into from

1 participant

thebluescreenofdeath
thebluescreenofdeath

No description provided.

thebluescreenofdeath thebluescreenofdeath deleted the branch
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Showing 1 unique commit by 1 author.

May 11, 2012
thebluescreenofdeath thebluescreenofdeath mega ultra fancy 49be320
This page is out of date. Refresh to see the latest.

Showing 1 changed file with 87 additions and 60 deletions. Show diff stats Hide diff stats

  1. +87 60 match.py
147 match.py
@@ -9,7 +9,7 @@
9 9 ### write to csv
10 10
def write_to_csv(target_file, output):
    """Append rows to a semicolon-delimited, latin-1 encoded CSV file.

    Args:
        target_file: Path of the CSV file to append to; created if missing.
        output: Iterable of rows, each row an iterable of cell values.

    The file is opened in append mode, so repeated calls (and repeated
    program runs) accumulate rows instead of overwriting earlier ones.
    """
    # newline='' lets the csv module control line endings itself.
    with open(target_file, 'a', newline='', encoding='latin-1') as f:
        csv.writer(f, delimiter=';').writerows(output)
15 15
@@ -24,14 +24,13 @@ def create_dictionary_from_csv(csvfile):
24 24 reader = csv.reader(open(csvfile, encoding='latin-1'), delimiter=';')
25 25 lookup_dict = {}
26 26 for line in reader:
27   - key_value = {line[1]:[line[2],line[3],line[0]]}
28 27 if line[1] in lookup_dict and line[2] in lookup_dict[line[1]][0]:
29 28 lookup_dict[line[1]].append(line[0])
30   - else:
31   - key_value = {line[1]:[line[2],line[3],line[0]]}
  29 + else :
  30 + key_value = {line[1]:[line[2],line[3]+' '+line[4],line[0]]}
32 31 lookup_dict.update(key_value)
33 32 return lookup_dict
34   -
  33 +
35 34 def create_searchword_list(csvfile):
36 35 reader = csv.reader(open(csvfile, encoding='latin-1'), delimiter=';')
37 36 searchword_list = []
@@ -58,11 +57,16 @@ def vlookup_similar(lookup_list, lookup_dictionary):
58 57 match_start = datetime.now()
59 58 precise_file = 'searchphrases_fine_matching.csv'
60 59 raw_match_file = 'searchphrases_raw_matching.csv'
  60 + brand_category_match_file = 'searchphrases_brand_category_matching.csv'
  61 + raw_category_match_file = 'searchphrases_raw_category_matching.csv'
61 62 header = [["SKU", "Match Type", "Match Count", "Search Volume", "Search Phrase", "Match Array", "Product Description", "Brand & Category"]]
62 63 write_to_csv(precise_file, header)
63 64 write_to_csv(raw_match_file, header)
  65 + write_to_csv(brand_category_match_file, header)
  66 + write_to_csv(raw_category_match_file, header)
64 67 counter = 0
65 68 for searchphrase in lookup_list:
  69 + counter = counter + 1
66 70 if counter >= 100:
67 71 counter = 0
68 72 match_stop = datetime.now()
@@ -75,27 +79,27 @@ def vlookup_similar(lookup_list, lookup_dictionary):
75 79 split_name = []
76 80 split_name = str.split(description)
77 81 for partial_description in split_name:
78   - description_match_ratio = difflib.SequenceMatcher(None, searchword, partial_description).real_quick_ratio()
  82 + description_match_ratio = difflib.SequenceMatcher(None, searchword, partial_description).ratio()
79 83 if description_match_ratio > 0.85:
80 84 # Found SKU
81 85 for key, value in searchphrase[0].items():
82 86 split_category = []
83 87 split_category = str.split(brand_category_sku[1])
84 88 for partial_category in split_category:
85   - category_match_ratio = difflib.SequenceMatcher(None, key, partial_category).real_quick_ratio()
  89 + category_match_ratio = difflib.SequenceMatcher(None, key, partial_category).ratio()
86 90 if category_match_ratio > 0.75:
87 91 match_dbc["category"][0] = 1
88 92 match_dbc["category"].append(key)
89 93 split_brand = []
90 94 split_brand = str.split(brand_category_sku[0])
91 95 for partial_brand in split_brand:
92   - brand_match_ratio = difflib.SequenceMatcher(None, key, partial_brand).real_quick_ratio()
  96 + brand_match_ratio = difflib.SequenceMatcher(None, key, partial_brand).ratio()
93 97 if brand_match_ratio > 0.75:
94 98 if not key in match_dbc["category"][1:]:
95 99 match_dbc["brand"][0] = 1
96 100 match_dbc["brand"].append(key)
97 101 for partial_description in split_name:
98   - description_match_ratio = difflib.SequenceMatcher(None, key, partial_description).real_quick_ratio()
  102 + description_match_ratio = difflib.SequenceMatcher(None, key, partial_description).ratio()
99 103 if description_match_ratio > 0.85:
100 104 if not key in match_dbc["category"][1:] and not key in match_dbc["brand"][1:]:
101 105 match_dbc["description"][0] = 1
@@ -125,69 +129,92 @@ def vlookup_similar(lookup_list, lookup_dictionary):
125 129 counter = counter + 1
126 130 else:
127 131 pass
128   - ### No Product Match ###
  132 + ### No Product Match ###
129 133 # Check Category & Brand!
130 134 if match_dbc == { "description":[0], "brand":[0], "category":[0] }:
131   - split_category = []
132   - split_category = str.split(brand_category_sku[1])
133   - for partial_category in split_category:
134   - category_match_ratio = difflib.SequenceMatcher(None, key, partial_category).real_quick_ratio()
135   - if category_match_ratio > 0.75:
136   - match_dbc["category"][0] = 1
137   - match_dbc["category"].append(key)
138   - split_brand = []
139   - split_brand = str.split(brand_category_sku[0])
140   - for partial_brand in split_brand:
141   - brand_match_ratio = difflib.SequenceMatcher(None, key, partial_brand).real_quick_ratio()
142   - if brand_match_ratio > 0.75:
143   - if not key in match_dbc["category"][1:]:
144   - match_dbc["brand"][0] = 1
145   - match_dbc["brand"].append(key)
146   - if match_dbc["description"][0] == 1 and match_dbc["category"][0] == 1 and match_dbc["brand"][0] == 1:
147   - match_type = "brand, category & description"
148   - if match_dbc["description"][0] == 1 and match_dbc["category"][0] == 0 and match_dbc["brand"][0] == 1:
149   - match_type = "brand & description"
150   - if match_dbc["description"][0] == 1 and match_dbc["category"][0] == 1 and match_dbc["brand"][0] == 0:
151   - match_type = "category & description"
152   - if match_dbc["description"][0] == 1 and match_dbc["category"][0] == 0 and match_dbc["brand"][0] == 0:
153   - match_type = "only description"
154   - if ((len(match_dbc["description"]) - 1) + (len(match_dbc["brand"]) - 1) + (len(match_dbc["category"]) - 1) ) >= len(searchphrase[0]):
155   - match_count = "complete match"
156   - else:
157   - match_count = "incomplete match"
158   - searched_words = []
159   - for word, empty in searchphrase[0].items():
160   - searched_words.append(word)
161   - final_phrase = ' '.join(searched_words)
162   - output = [[brand_category_sku[2], match_type, match_count, searchphrase[1], final_phrase, match_dbc, description, brand_category_sku[:2]]]
163   - if match_type == "brand, category & description" or match_type == "brand & description":
164   - write_to_csv(precise_file, output)
  135 + for key, value in searchphrase[0].items():
  136 + split_category = []
  137 + split_category = str.split(brand_category_sku[1])
  138 + for partial_category in split_category:
  139 + category_match_ratio = difflib.SequenceMatcher(None, key, partial_category).ratio()
  140 + if category_match_ratio > 0.75:
  141 + match_dbc["category"][0] = 1
  142 + match_dbc["category"].append(key)
  143 + split_brand = []
  144 + split_brand = str.split(brand_category_sku[0])
  145 + for partial_brand in split_brand:
  146 + brand_match_ratio = difflib.SequenceMatcher(None, key, partial_brand).ratio()
  147 + if brand_match_ratio > 0.75:
  148 + if not key in match_dbc["category"][1:]:
  149 + match_dbc["brand"][0] = 1
  150 + match_dbc["brand"].append(key)
  151 + if not match_dbc == { "description":[0], "brand":[0], "category":[0] }:
  152 + if match_dbc["category"][0] == 1 and match_dbc["brand"][0] == 1:
  153 + match_type = "brand & category"
  154 + if match_dbc["category"][0] == 0 and match_dbc["brand"][0] == 1:
  155 + match_type = "brand"
  156 + if match_dbc["category"][0] == 1 and match_dbc["brand"][0] == 0:
  157 + match_type = "category"
  158 + if ((len(match_dbc["brand"]) - 1) + (len(match_dbc["category"]) - 1)) >= len(searchphrase[0]):
  159 + match_count = "complete match"
  160 + else:
  161 + match_count = "incomplete match"
  162 + searched_words = []
  163 + for word, empty in searchphrase[0].items():
  164 + searched_words.append(word)
  165 + final_phrase = ' '.join(searched_words)
  166 + output = [[brand_category_sku[2], match_type, match_count, searchphrase[1], final_phrase, match_dbc, description, brand_category_sku[:2]]]
  167 + if match_type == "brand & category":
  168 + write_to_csv(brand_category_match_file, output)
  169 + else:
  170 + write_to_csv(raw_category_match_file, output)
  171 +
165 172 else:
166   - write_to_csv(raw_match_file, output)
167   - counter = counter + 1
168   - else:
169   - pass
  173 + pass
170 174 else:
171 175 pass
172 176
def create_output_dictionaries_from_csv(csvfile):
    """Build a lookup from column 4 to column 1 of a semicolon-delimited CSV.

    Args:
        csvfile: Path of a latin-1 encoded, ';'-delimited CSV file.

    Returns:
        dict mapping each row's fifth cell (index 4) to its second cell
        (index 1). When a key occurs on several rows, the last row wins.

    Rows with fewer than five cells (including blank lines) are skipped.
    Every remaining row is mapped, so a header row, if the file has one,
    ends up in the result as an ordinary entry.
    """
    output_dict = {}
    # Use a context manager so the file handle is closed deterministically.
    with open(csvfile, newline='', encoding='latin-1') as f:
        for line in csv.reader(f, delimiter=';'):
            if len(line) < 5:
                # Not enough columns to hold both key and value.
                continue
            output_dict[line[4]] = line[1]
    return output_dict
173 186
174   -def find_residuals(search_input, product_output, category_output, brand_output, residuals_csv):
175   - pass
176   - # map each output into dictionary with search phrase as key
177   - # loop through search phrases
178   - # find key in search phrases of matched products
179   - # break
180   - # find key in search phrase of matched categories
181   - # break
182   - # find key in search phrase of matched brands
183   - # break
184   - # output residuals
def find_residuals(search_input):
    """Classify every search phrase by the best match file that contains it.

    Each phrase is looked up in the four match result files in priority
    order (fine, raw, brand & category, raw category). The value found in
    the first file that knows the phrase is recorded; phrases found in no
    file are recorded as "Residuals". One row [phrase, result] per entry
    is appended to 'residuen_list.csv'.

    Args:
        search_input: Iterable of search phrases. An entry may be a plain
            string, or a sequence whose first element is a mapping keyed by
            the phrase's words, in which case the phrase is the words
            joined by single spaces.
            NOTE(review): the sequence form mirrors how vlookup_similar
            builds its "Search Phrase" column -- confirm against the
            producer of the search list.
    """
    residuen_csv = 'residuen_list.csv'
    # Ordered from most to least specific match; the first hit wins.
    match_files = (
        'searchphrases_fine_matching.csv',
        'searchphrases_raw_matching.csv',
        'searchphrases_brand_category_matching.csv',
        'searchphrases_raw_category_matching.csv',
    )
    match_dicts = [create_output_dictionaries_from_csv(name) for name in match_files]

    rows = []
    for searchphrase in search_input:
        if isinstance(searchphrase, str):
            phrase = searchphrase
        else:
            # Entries that are not strings cannot be used as dict keys;
            # rebuild the phrase text from the word mapping instead.
            phrase = ' '.join(searchphrase[0])
        output = "Residuals"
        for match_dict in match_dicts:
            if phrase in match_dict:
                output = match_dict[phrase]
                break
        rows.append([phrase, output])

    if rows:
        # write_to_csv takes the target file first and a list of rows second.
        write_to_csv(residuen_csv, rows)
186 209
# Load the product lookup table and the search phrases to match.
lookup_dict = create_dictionary_from_csv(lookup_csv)
searchword_list = searchword_searchphrase_dict(to_match_csv)
# Time only the matching and residual passes, not the CSV loading above.
start = datetime.now()
vlookup_similar(searchword_list, lookup_dict)
# Pass the loaded search list; 'lookup_list' is not defined at module scope.
# NOTE(review): confirm find_residuals accepts the entry shape produced by
# searchword_searchphrase_dict.
find_residuals(searchword_list)
stop = datetime.now()
result = stop - start
print(result)
  218 +
  219 +
  220 +

Tip: You can add notes to lines in a file. Hover to the left of a line to make a note

Something went wrong with that request. Please try again.